diff --git a/lms/djangoapps/courseware/management/commands/dump_to_neo4j.py b/lms/djangoapps/courseware/management/commands/dump_to_neo4j.py new file mode 100644 index 0000000000..cf5fc8d3b9 --- /dev/null +++ b/lms/djangoapps/courseware/management/commands/dump_to_neo4j.py @@ -0,0 +1,211 @@ +""" +This file contains a management command for exporting the modulestore to +neo4j, a graph database. +""" +from __future__ import unicode_literals + +import logging + +from django.conf import settings +from django.core.management.base import BaseCommand, CommandError +from django.utils import six +from py2neo import Graph, Node, Relationship, authenticate +from py2neo.compat import integer, string, unicode as neo4j_unicode +from request_cache.middleware import RequestCache +from xmodule.modulestore.django import modulestore + +log = logging.getLogger(__name__) + +# When testing locally, neo4j's bolt logger was noisy, so we'll only have it +# emit logs if there's an error. +bolt_log = logging.getLogger('neo4j.bolt') # pylint: disable=invalid-name +bolt_log.setLevel(logging.ERROR) + +ITERABLE_NEO4J_TYPES = (tuple, list, set, frozenset) +PRIMITIVE_NEO4J_TYPES = (integer, string, neo4j_unicode, float, bool) + + +class ModuleStoreSerializer(object): + """ + Class with functionality to serialize a modulestore into subgraphs, + one graph per course. + """ + def __init__(self): + self.all_courses = modulestore().get_course_summaries() + + @staticmethod + def serialize_item(item): + """ + Args: + item: an XBlock + + Returns: + fields: a dictionary of an XBlock's field names and values + label: the name of the XBlock's type (i.e. 'course' + or 'problem') + """ + # convert all fields to a dict and filter out parent and children field + fields = dict( + (field, field_value.read_from(item)) + for (field, field_value) in six.iteritems(item.fields) + if field not in ['parent', 'children'] + ) + + course_key = item.scope_ids.usage_id.course_key + + # set or reset some defaults + fields['edited_on'] = six.text_type(getattr(item, 'edited_on', '')) + fields['display_name'] = item.display_name_with_default + fields['org'] = course_key.org + fields['course'] = course_key.course + fields['run'] = course_key.run + fields['course_key'] = six.text_type(course_key) + + label = item.scope_ids.block_type + + # prune some fields + if label == 'course': + if 'checklists' in fields: + del fields['checklists'] + + return fields, label + + def serialize_course(self, course_id): + """ + Args: + course_id: CourseKey of the course we want to serialize + + Returns: + nodes: a list of py2neo Node objects + relationships: a list of py2neo Relationships objects + + Serializes a course into Nodes and Relationships + """ + # create a location to node mapping we'll need later for + # writing relationships + location_to_node = {} + items = modulestore().get_items(course_id) + + # create nodes + nodes = [] + for item in items: + fields, label = self.serialize_item(item) + + for field_name, value in six.iteritems(fields): + fields[field_name] = self.coerce_types(value) + + node = Node(label, 'item', **fields) + nodes.append(node) + location_to_node[item.location] = node + + # create relationships + relationships = [] + for item in items: + for child_loc in item.get_children(): + parent_node = location_to_node.get(item.location) + child_node = location_to_node.get(child_loc.location) + if parent_node is not None and child_node is not None: + relationship = Relationship(parent_node, "PARENT_OF", child_node) + relationships.append(relationship) + + return nodes, relationships + + @staticmethod + def coerce_types(value): + """ + Args: + value: the value of an xblock's field + + Returns: either the value, a text version of the value, or, if the + value is iterable, the value with each element being converted to text + """ + + coerced_value = value + if isinstance(value, ITERABLE_NEO4J_TYPES): + coerced_value = [] + for element in value: + coerced_value.append(six.text_type(element)) + # convert coerced_value back to its original type + coerced_value = type(value)(coerced_value) + + # if it's not one of the types that neo4j accepts, + # just convert it to text + elif not isinstance(value, PRIMITIVE_NEO4J_TYPES): + coerced_value = six.text_type(value) + + return coerced_value + + +class Command(BaseCommand): + """ + Command to dump modulestore data to neo4j + """ + + @staticmethod + def add_to_transaction(neo4j_entities, transaction): + """ + Args: + neo4j_entities: a list of Nodes or Relationships + transaction: a neo4j transaction + """ + for entity in neo4j_entities: + transaction.create(entity) + + def handle(self, *args, **options): # pylint: disable=unused-argument + """ + Iterates through each course, serializes them into graphs, and saves + those graphs to neo4j. + """ + # first, make sure that there's a valid neo4j configuration + if settings.NEO4J_CONFIG is None: + raise CommandError( + "No neo4j configuration (NEO4J_CONFIG) defined in lms.auth.json." + ) + + auth_params = ["{host}:{https_port}", "{user}", "{password}"] + authenticate(*[param.format(**settings.NEO4J_CONFIG) for param in auth_params]) + + graph = Graph(**settings.NEO4J_CONFIG) + + mss = ModuleStoreSerializer() + + total_number_of_courses = len(mss.all_courses) + + for index, course in enumerate(mss.all_courses): + # first, clear the request cache to prevent memory leaks + RequestCache.clear_request_cache() + + log.info( + "Now exporting %s to neo4j: course %d of %d total courses", + course.id, + index + 1, + total_number_of_courses + ) + nodes, relationships = mss.serialize_course(course.id) + log.info( + "%d nodes and %d relationships in %s", + len(nodes), + len(relationships), + course.id + ) + + transaction = graph.begin() + try: + # first, delete existing course + transaction.run( + "MATCH (n:item) WHERE n.course_key='{}' DETACH DELETE n".format( + six.text_type(course.id) + ) + ) + + # now, re-add it + self.add_to_transaction(nodes, transaction) + self.add_to_transaction(relationships, transaction) + transaction.commit() + + except Exception: # pylint: disable=broad-except + log.exception( + "Error trying to dump course %s to neo4j, rolling back", + six.text_type(course.id) + ) + transaction.rollback() diff --git a/lms/djangoapps/courseware/management/commands/tests/test_dump_to_neo4j.py b/lms/djangoapps/courseware/management/commands/tests/test_dump_to_neo4j.py new file mode 100644 index 0000000000..0d7aea4d59 --- /dev/null +++ b/lms/djangoapps/courseware/management/commands/tests/test_dump_to_neo4j.py @@ -0,0 +1,154 @@ +# coding=utf-8 +""" +Tests for the dump_to_neo4j management command. +""" +from __future__ import unicode_literals + +import ddt +import mock +from courseware.management.commands.dump_to_neo4j import ( + ModuleStoreSerializer, + ITERABLE_NEO4J_TYPES, +) +from django.core.management import call_command +from django.core.management.base import CommandError +from django.utils import six +from xmodule.modulestore.tests.django_utils import SharedModuleStoreTestCase +from xmodule.modulestore.tests.factories import CourseFactory, ItemFactory + + +class TestDumpToNeo4jCommandBase(SharedModuleStoreTestCase): + """ + Base class for the test suites in this file. Sets up a couple courses. + """ + @classmethod + def setUpClass(cls): + super(TestDumpToNeo4jCommandBase, cls).setUpClass() + cls.course = CourseFactory.create() + cls.chapter = ItemFactory.create(parent=cls.course, category='chapter') + cls.sequential = ItemFactory.create(parent=cls.chapter, category='sequential') + cls.vertical = ItemFactory.create(parent=cls.sequential, category='vertical') + cls.html = ItemFactory.create(parent=cls.vertical, category='html') + cls.problem = ItemFactory.create(parent=cls.vertical, category='problem') + cls.video = ItemFactory.create(parent=cls.vertical, category='video') + cls.video2 = ItemFactory.create(parent=cls.vertical, category='video') + + cls.course2 = CourseFactory.create() + + +class TestDumpToNeo4jCommand(TestDumpToNeo4jCommandBase): + """ + Tests for the dump to neo4j management command + """ + @mock.patch('courseware.management.commands.dump_to_neo4j.Graph') + def test_dump_to_neo4j(self, mock_graph_class): + """ + Tests the dump_to_neo4j management command works against a mock + py2neo Graph + """ + mock_graph = mock_graph_class.return_value + mock_transaction = mock.Mock() + mock_graph.begin.return_value = mock_transaction + + call_command('dump_to_neo4j') + + self.assertEqual(mock_graph.begin.call_count, 2) + self.assertEqual(mock_transaction.commit.call_count, 2) + self.assertEqual(mock_transaction.rollback.call_count, 0) + + # 7 nodes + 9 relationships from the first course + # 2 nodes and no relationships from the second + self.assertEqual(mock_transaction.create.call_count, 18) + self.assertEqual(mock_transaction.run.call_count, 2) + + @mock.patch('courseware.management.commands.dump_to_neo4j.Graph') + def test_dump_to_neo4j_rollback(self, mock_graph_class): + """ + Tests that the management command handles the case where there's + an exception trying to write to the neo4j database. + """ + mock_graph = mock_graph_class.return_value + mock_transaction = mock.Mock() + mock_graph.begin.return_value = mock_transaction + mock_transaction.run.side_effect = ValueError('Something went wrong!') + + call_command('dump_to_neo4j') + + self.assertEqual(mock_graph.begin.call_count, 2) + self.assertEqual(mock_transaction.commit.call_count, 0) + self.assertEqual(mock_transaction.rollback.call_count, 2) + + @mock.patch('django.conf.settings.NEO4J_CONFIG', None) + def test_dump_to_neo4j_no_config(self): + """ + Tests that the command errors out if there isn't a configuration + file found + """ + with self.assertRaises(CommandError): + call_command('dump_to_neo4j') + + +@ddt.ddt +class TestModuleStoreSerializer(TestDumpToNeo4jCommandBase): + """ + Tests for the ModuleStoreSerializer + """ + def setUp(self): + super(TestModuleStoreSerializer, self).setUp() + self.modulestore_serializer = ModuleStoreSerializer() + + def test_serialize_item(self): + """ + Tests the serialize_item method. + """ + fields, label = self.modulestore_serializer.serialize_item(self.course) + self.assertEqual(label, "course") + self.assertIn("edited_on", fields.keys()) + self.assertIn("display_name", fields.keys()) + self.assertIn("org", fields.keys()) + self.assertIn("course", fields.keys()) + self.assertIn("run", fields.keys()) + self.assertIn("course_key", fields.keys()) + self.assertNotIn("checklist", fields.keys()) + + def test_serialize_course(self): + """ + Tests the serialize_course method. + """ + nodes, relationships = self.modulestore_serializer.serialize_course( + self.course.id + ) + self.assertEqual(len(nodes), 9) + self.assertEqual(len(relationships), 7) + + @ddt.data(*ITERABLE_NEO4J_TYPES) + def test_coerce_types_iterable(self, iterable_type): + """ + Tests the coerce_types helper method for iterable types + """ + example_iterable = iterable_type([object, object, object]) + + # each element in the iterable is not unicode: + self.assertFalse(any(isinstance(tab, six.text_type) for tab in example_iterable)) + # but after they are coerced, they are: + coerced = self.modulestore_serializer.coerce_types(example_iterable) + self.assertTrue(all(isinstance(tab, six.text_type) for tab in coerced)) + # finally, make sure we haven't changed the type: + self.assertEqual(type(coerced), iterable_type) + + @ddt.data( + (1, 1), + (object, ""), + (1.5, 1.5), + ("úñîçø∂é", "úñîçø∂é"), + (b"plain string", b"plain string"), + (True, True), + (None, "None"), + ) + @ddt.unpack + def test_coerce_types_base(self, original_value, coerced_expected): + """ + Tests the coerce_types helper for the neo4j base types + """ + coerced_value = self.modulestore_serializer.coerce_types(original_value) + self.assertEqual(coerced_value, coerced_expected) diff --git a/lms/envs/aws.py b/lms/envs/aws.py index 3b03ca6c98..11268829c9 100644 --- a/lms/envs/aws.py +++ b/lms/envs/aws.py @@ -863,3 +863,6 @@ API_ACCESS_FROM_EMAIL = ENV_TOKENS.get('API_ACCESS_FROM_EMAIL') APP_UPGRADE_CACHE_TIMEOUT = ENV_TOKENS.get('APP_UPGRADE_CACHE_TIMEOUT', APP_UPGRADE_CACHE_TIMEOUT) AFFILIATE_COOKIE_NAME = ENV_TOKENS.get('AFFILIATE_COOKIE_NAME', AFFILIATE_COOKIE_NAME) + +############## Settings for Neo4j ############################ +NEO4J_CONFIG = AUTH_TOKENS.get('NEO4J_CONFIG') diff --git a/lms/envs/common.py b/lms/envs/common.py index a24d47866f..78bc1b6e2b 100644 --- a/lms/envs/common.py +++ b/lms/envs/common.py @@ -2951,3 +2951,8 @@ AFFILIATE_COOKIE_NAME = 'affiliate_id' # The cache is cleared when Redirect models are saved/deleted REDIRECT_CACHE_TIMEOUT = None # The length of time we cache Redirect model data REDIRECT_CACHE_KEY_PREFIX = 'redirects' + +############## Settings for Neo4j ############################ + +# This should be set in configuration +NEO4J_CONFIG = None diff --git a/lms/envs/test.py b/lms/envs/test.py index 9dd30f3cb7..d6388c3302 100644 --- a/lms/envs/test.py +++ b/lms/envs/test.py @@ -590,3 +590,13 @@ COURSE_CATALOG_API_URL = 'https://catalog.example.com/api/v1' COMPREHENSIVE_THEME_DIRS = [REPO_ROOT / "themes", REPO_ROOT / "common/test"] LMS_ROOT_URL = "http://localhost:8000" + +# Test configuration for neo4j +NEO4J_CONFIG = { + 'bolt': True, + 'password': 'password', + 'user': 'neo4j', + 'https_port': 7473, + 'host': 'localhost', + 'secure': True, +} diff --git a/requirements/edx/base.txt b/requirements/edx/base.txt index 047fff7c7b..394b7428f2 100644 --- a/requirements/edx/base.txt +++ b/requirements/edx/base.txt @@ -186,3 +186,7 @@ sailthru-client==2.2.3 # Release utils for the edx release pipeline edx-django-release-util==0.1.0 + +# Used to communicate with Neo4j, which is used internally for +# modulestore inspection +py2neo==3.1.2