From 5ef1e0805052e52127c611110d26db447e120893 Mon Sep 17 00:00:00 2001 From: David Ormsbee Date: Fri, 13 Jan 2017 17:31:16 -0500 Subject: [PATCH 1/2] Disable student state writes for crawlers. When crawlers like edX-downloader make requests on courseware, they are often concurrently loading many units in the same sequence. This causes contention for the rows in courseware_studentmodule that store the student's state for various XBlocks/XModules, most notably for the sequence, chapter, and course -- all of which record and update user position information when loaded. It would be nice if we could actually remove these writes altogether and come up with a cleaner way of keeping track of the user's position. In general, GETs should be side-effect free. However, any such change would break backwards compatibility, and would require close coordination with research teams to make sure they weren't negatively affected. This commit identifies crawlers by user agent (CrawlersConfig model), and blocks student state writes if a crawler is detected. FieldDataCache writes simply become no-ops. It doesn't actually alter the rendering of the courseware in any way -- the main impact is that the blocks won't record your most recent position, which is meaningless for crawlers anyway. This can also be used as a building block for other policy we want to define around crawlers. We just have to be mindful that this only works with "nice" crawlers who are honest in their user agents, and that significantly more sophisticated (and costly) measures would be necessary to prevent crawlers that try to be even trivially sneaky. [PERF-403] --- lms/djangoapps/courseware/model_data.py | 12 ++- lms/djangoapps/courseware/module_render.py | 4 +- lms/djangoapps/courseware/tests/test_views.py | 83 ++++++++++++++++++- lms/djangoapps/courseware/views/index.py | 15 ++-- lms/envs/common.py | 3 + openedx/core/djangoapps/crawlers/__init__.py | 0 openedx/core/djangoapps/crawlers/admin.py | 7 ++ .../crawlers/migrations/0001_initial.py | 30 +++++++ .../crawlers/migrations/__init__.py | 0 openedx/core/djangoapps/crawlers/models.py | 45 ++++++++++ openedx/core/djangoapps/crawlers/tests.py | 0 11 files changed, 189 insertions(+), 10 deletions(-) create mode 100644 openedx/core/djangoapps/crawlers/__init__.py create mode 100644 openedx/core/djangoapps/crawlers/admin.py create mode 100644 openedx/core/djangoapps/crawlers/migrations/0001_initial.py create mode 100644 openedx/core/djangoapps/crawlers/migrations/__init__.py create mode 100644 openedx/core/djangoapps/crawlers/models.py create mode 100644 openedx/core/djangoapps/crawlers/tests.py diff --git a/lms/djangoapps/courseware/model_data.py b/lms/djangoapps/courseware/model_data.py index fe2e88c89e..aa11c0d2aa 100644 --- a/lms/djangoapps/courseware/model_data.py +++ b/lms/djangoapps/courseware/model_data.py @@ -687,7 +687,7 @@ class FieldDataCache(object): A cache of django model objects needed to supply the data for a module and its descendants """ - def __init__(self, descriptors, course_id, user, select_for_update=False, asides=None): + def __init__(self, descriptors, course_id, user, select_for_update=False, asides=None, read_only=False): """ Find any courseware.models objects that are needed by any descriptor in descriptors. Attempts to minimize the number of queries to the database. @@ -700,6 +700,7 @@ class FieldDataCache(object): user: The user for which to cache data select_for_update: Ignored asides: The list of aside types to load, or None to prefetch no asides. + read_only: We should not perform writes (they become a no-op). """ if asides is None: self.asides = [] @@ -709,6 +710,7 @@ class FieldDataCache(object): assert isinstance(course_id, CourseKey) self.course_id = course_id self.user = user + self.read_only = read_only self.cache = { Scope.user_state: UserStateCache( @@ -783,7 +785,7 @@ class FieldDataCache(object): @classmethod def cache_for_descriptor_descendents(cls, course_id, user, descriptor, depth=None, descriptor_filter=lambda descriptor: True, - select_for_update=False, asides=None): + select_for_update=False, asides=None, read_only=False): """ course_id: the course in the context of which we want StudentModules. user: the django user for whom to load modules. @@ -794,7 +796,7 @@ class FieldDataCache(object): should be cached select_for_update: Ignored """ - cache = FieldDataCache([], course_id, user, select_for_update, asides=asides) + cache = FieldDataCache([], course_id, user, select_for_update, asides=asides, read_only=read_only) cache.add_descriptor_descendents(descriptor, depth, descriptor_filter) return cache @@ -840,6 +842,8 @@ class FieldDataCache(object): kv_dict (dict): dict mapping from `DjangoKeyValueStore.Key`s to field values Raises: DatabaseError if any fields fail to save """ + if self.read_only: + return saved_fields = [] by_scope = defaultdict(dict) @@ -875,6 +879,8 @@ class FieldDataCache(object): Raises: KeyError if key isn't found in the cache """ + if self.read_only: + return if key.scope.user == UserScope.ONE and not self.user.is_anonymous(): # If we're getting user data, we expect that the key matches the diff --git a/lms/djangoapps/courseware/module_render.py b/lms/djangoapps/courseware/module_render.py index 77bbff227d..33765384d0 100644 --- a/lms/djangoapps/courseware/module_render.py +++ b/lms/djangoapps/courseware/module_render.py @@ -49,6 +49,7 @@ from lms.djangoapps.lms_xblock.models import XBlockAsidesConfig from lms.djangoapps.lms_xblock.runtime import LmsModuleSystem from lms.djangoapps.verify_student.services import VerificationService, ReverificationService from openedx.core.djangoapps.bookmarks.services import BookmarksService +from openedx.core.djangoapps.crawlers.models import CrawlersConfig from openedx.core.djangoapps.credit.services import CreditService from openedx.core.djangoapps.util.user_utils import SystemUser from openedx.core.lib.xblock_utils import ( @@ -917,7 +918,8 @@ def get_module_by_usage_id(request, course_id, usage_id, disable_staff_debug_inf field_data_cache = FieldDataCache.cache_for_descriptor_descendents( course_id, user, - descriptor + descriptor, + read_only=CrawlersConfig.is_crawler(request), ) instance = get_module_for_descriptor( user, diff --git a/lms/djangoapps/courseware/tests/test_views.py b/lms/djangoapps/courseware/tests/test_views.py index 7ee98fbcb5..6f2c7b1682 100644 --- a/lms/djangoapps/courseware/tests/test_views.py +++ b/lms/djangoapps/courseware/tests/test_views.py @@ -50,6 +50,7 @@ from lms.djangoapps.commerce.utils import EcommerceService # pylint: disable=im from milestones.tests.utils import MilestonesTestCaseMixin from openedx.core.djangoapps.self_paced.models import SelfPacedConfiguration from openedx.core.lib.gating import api as gating_api +from openedx.core.djangoapps.crawlers.models import CrawlersConfig from student.models import CourseEnrollment from student.tests.factories import AdminFactory, UserFactory, CourseEnrollmentFactory from util.tests.test_date_utils import fake_ugettext, fake_pgettext @@ -58,7 +59,7 @@ from util.views import ensure_valid_course_key from xmodule.modulestore import ModuleStoreEnum from xmodule.modulestore.django import modulestore from xmodule.modulestore.tests.django_utils import TEST_DATA_MIXED_MODULESTORE -from xmodule.modulestore.tests.django_utils import ModuleStoreTestCase +from xmodule.modulestore.tests.django_utils import ModuleStoreTestCase, SharedModuleStoreTestCase from xmodule.modulestore.tests.factories import CourseFactory, ItemFactory, check_mongo_calls from openedx.core.djangoapps.credit.api import set_credit_requirements from openedx.core.djangoapps.credit.models import CreditCourse, CreditProvider @@ -2065,3 +2066,83 @@ class TestRenderXBlockSelfPaced(TestRenderXBlock): def course_options(self): return {'self_paced': True} + + +class TestIndexViewCrawlerStudentStateWrites(SharedModuleStoreTestCase): + """ + Ensure that courseware index requests do not trigger student state writes. + + This is to prevent locking issues that have caused latency spikes in the + courseware_studentmodule table when concurrent requests each try to update + the same rows for sequence, section, and course positions. + """ + @classmethod + def setUpClass(cls): + """Set up the simplest course possible.""" + # setUpClassAndTestData() already calls setUpClass on SharedModuleStoreTestCase + # pylint: disable=super-method-not-called + with super(TestIndexViewCrawlerStudentStateWrites, cls).setUpClassAndTestData(): + cls.course = CourseFactory.create() + with cls.store.bulk_operations(cls.course.id): + cls.chapter = ItemFactory.create(category='chapter', parent_location=cls.course.location) + cls.section = ItemFactory.create(category='sequential', parent_location=cls.chapter.location) + cls.vertical = ItemFactory.create(category='vertical', parent_location=cls.section.location) + + @classmethod + def setUpTestData(cls): + """Set up and enroll our fake user in the course.""" + cls.password = 'test' + cls.user = UserFactory(password=cls.password) + CourseEnrollment.enroll(cls.user, cls.course.id) + + def setUp(self): + """Do the client login.""" + super(TestIndexViewCrawlerStudentStateWrites, self).setUp() + self.client.login(username=self.user.username, password=self.password) + + def test_write_by_default(self): + """By default, always write student state, regardless of user agent.""" + with patch('courseware.model_data.UserStateCache.set_many') as patched_state_client_set_many: + # Simulate someone using Chrome + self._load_courseware('Mozilla/5.0 AppleWebKit/537.36') + self.assertTrue(patched_state_client_set_many.called) + patched_state_client_set_many.reset_mock() + + # Common crawler user agent + self._load_courseware('edX-downloader/0.1') + self.assertTrue(patched_state_client_set_many.called) + + def test_writes_with_config(self): + """Test state writes (or lack thereof) based on config values.""" + CrawlersConfig.objects.create(known_user_agents='edX-downloader,crawler_foo', enabled=True) + with patch('courseware.model_data.UserStateCache.set_many') as patched_state_client_set_many: + # Exact matching of crawler user agent + self._load_courseware('crawler_foo') + self.assertFalse(patched_state_client_set_many.called) + + # Partial matching of crawler user agent + self._load_courseware('edX-downloader/0.1') + self.assertFalse(patched_state_client_set_many.called) + + # Simulate an actual browser hitting it (we should write) + self._load_courseware('Mozilla/5.0 AppleWebKit/537.36') + self.assertTrue(patched_state_client_set_many.called) + + # Disabling the crawlers config should revert us to default behavior + CrawlersConfig.objects.create(enabled=False) + self.test_write_by_default() + + def _load_courseware(self, user_agent): + """Helper to load the actual courseware page.""" + url = reverse( + 'courseware_section', + kwargs={ + 'course_id': unicode(self.course.id), + 'chapter': unicode(self.chapter.location.name), + 'section': unicode(self.section.location.name), + } + ) + response = self.client.get(url, HTTP_USER_AGENT=user_agent) + # Make sure we get back an actual 200, and aren't redirected because we + # messed up the setup somehow (e.g. didn't enroll properly) + self.assertEqual(response.status_code, 200) diff --git a/lms/djangoapps/courseware/views/index.py b/lms/djangoapps/courseware/views/index.py index 2744cb8433..cffdee677d 100644 --- a/lms/djangoapps/courseware/views/index.py +++ b/lms/djangoapps/courseware/views/index.py @@ -26,6 +26,7 @@ from xblock.fragment import Fragment from opaque_keys.edx.keys import CourseKey from openedx.core.djangoapps.lang_pref import LANGUAGE_KEY from openedx.core.djangoapps.user_api.preferences.api import get_user_preference +from openedx.core.djangoapps.crawlers.models import CrawlersConfig from shoppingcart.models import CourseRegistrationCode from student.models import CourseEnrollment from student.views import is_course_blocked @@ -101,7 +102,7 @@ class CoursewareIndex(View): self.course = get_course_with_access(request.user, 'load', self.course_key, depth=CONTENT_DEPTH) self.is_staff = has_access(request.user, 'staff', self.course) self._setup_masquerade_for_effective_user() - return self._get() + return self._get(request) except Redirect as redirect_error: return redirect(redirect_error.url) except UnicodeEncodeError: @@ -127,12 +128,12 @@ class CoursewareIndex(View): # Set the user in the request to the effective user. self.request.user = self.effective_user - def _get(self): + def _get(self, request): """ Render the index page. """ self._redirect_if_needed_to_access_course() - self._prefetch_and_bind_course() + self._prefetch_and_bind_course(request) if self.course.has_children_at_depth(CONTENT_DEPTH): self._reset_section_to_exam_if_required() @@ -324,13 +325,17 @@ class CoursewareIndex(View): if self.chapter: return self._find_block(self.chapter, self.section_url_name, 'section') - def _prefetch_and_bind_course(self): + def _prefetch_and_bind_course(self, request): """ Prefetches all descendant data for the requested section and sets up the runtime, which binds the request user to the section. """ self.field_data_cache = FieldDataCache.cache_for_descriptor_descendents( - self.course_key, self.effective_user, self.course, depth=CONTENT_DEPTH, + self.course_key, + self.effective_user, + self.course, + depth=CONTENT_DEPTH, + read_only=CrawlersConfig.is_crawler(request), ) self.course = get_module_for_descriptor( diff --git a/lms/envs/common.py b/lms/envs/common.py index 89874df437..f8504e55c5 100644 --- a/lms/envs/common.py +++ b/lms/envs/common.py @@ -2160,6 +2160,9 @@ INSTALLED_APPS = ( # Customized celery tasks, including persisting failed tasks so they can # be retried 'openedx.core.djangoapps.celery_utils', + + # Ability to detect and special-case crawler behavior + 'openedx.core.djangoapps.crawlers', ) # Migrations which are not in the standard module "migrations" diff --git a/openedx/core/djangoapps/crawlers/__init__.py b/openedx/core/djangoapps/crawlers/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/openedx/core/djangoapps/crawlers/admin.py b/openedx/core/djangoapps/crawlers/admin.py new file mode 100644 index 0000000000..45085716e3 --- /dev/null +++ b/openedx/core/djangoapps/crawlers/admin.py @@ -0,0 +1,7 @@ +"""Admin panel for configuring which user agents we consider to be Crawlers.""" +from django.contrib import admin + +from config_models.admin import ConfigurationModelAdmin +from .models import CrawlersConfig + +admin.site.register(CrawlersConfig, ConfigurationModelAdmin) diff --git a/openedx/core/djangoapps/crawlers/migrations/0001_initial.py b/openedx/core/djangoapps/crawlers/migrations/0001_initial.py new file mode 100644 index 0000000000..0b95a9ec4b --- /dev/null +++ b/openedx/core/djangoapps/crawlers/migrations/0001_initial.py @@ -0,0 +1,30 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +from django.db import migrations, models +import django.db.models.deletion +from django.conf import settings + + +class Migration(migrations.Migration): + + dependencies = [ + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ] + + operations = [ + migrations.CreateModel( + name='CrawlersConfig', + fields=[ + ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)), + ('change_date', models.DateTimeField(auto_now_add=True, verbose_name='Change date')), + ('enabled', models.BooleanField(default=False, verbose_name='Enabled')), + ('known_user_agents', models.TextField(default=b'edX-downloader', help_text=b'A comma-separated list of known crawler user agents.', blank=True)), + ('changed_by', models.ForeignKey(on_delete=django.db.models.deletion.PROTECT, editable=False, to=settings.AUTH_USER_MODEL, null=True, verbose_name='Changed by')), + ], + options={ + 'ordering': ('-change_date',), + 'abstract': False, + }, + ), + ] diff --git a/openedx/core/djangoapps/crawlers/migrations/__init__.py b/openedx/core/djangoapps/crawlers/migrations/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/openedx/core/djangoapps/crawlers/models.py b/openedx/core/djangoapps/crawlers/models.py new file mode 100644 index 0000000000..1cb7dcdef3 --- /dev/null +++ b/openedx/core/djangoapps/crawlers/models.py @@ -0,0 +1,45 @@ +""" +This module handles the detection of crawlers, so that we can handle them +appropriately in other parts of the code. +""" +from django.db import models + +from config_models.models import ConfigurationModel + + +class CrawlersConfig(ConfigurationModel): + """Configuration for the crawlers django app.""" + known_user_agents = models.TextField( + blank=True, + help_text="A comma-separated list of known crawler user agents.", + default='edX-downloader', + ) + + def __unicode__(self): + return u'CrawlersConfig("{}")'.format(self.known_user_agents) + + @classmethod + def is_crawler(cls, request): + """Determine if the request came from a crawler or not. + + This method is simplistic and only looks at the user agent header at the + moment, but could later be improved to be smarter about detection. + """ + current = cls.current() + if not current.enabled: + return False + + req_user_agent = request.META.get('HTTP_USER_AGENT') + crawler_agents = current.known_user_agents.split(",") + + # If there was no user agent detected or no crawler agents configured, + # then just return False. + if (not req_user_agent) or (not crawler_agents): + return False + + # We perform prefix matching of the crawler agent here so that we don't + # have to worry about version bumps. + return any( + req_user_agent.startswith(crawler_agent) + for crawler_agent in crawler_agents + ) diff --git a/openedx/core/djangoapps/crawlers/tests.py b/openedx/core/djangoapps/crawlers/tests.py new file mode 100644 index 0000000000..e69de29bb2 From 608fceb0b54cba1a571c479aa0bbab1caac4b260 Mon Sep 17 00:00:00 2001 From: David Ormsbee Date: Thu, 26 Jan 2017 09:36:01 -0500 Subject: [PATCH 2/2] Remove select_for_update option for FieldDataCache We've already been ignoring the param for a long time. This just removes it entirely. --- lms/djangoapps/courseware/model_data.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/lms/djangoapps/courseware/model_data.py b/lms/djangoapps/courseware/model_data.py index aa11c0d2aa..d450785063 100644 --- a/lms/djangoapps/courseware/model_data.py +++ b/lms/djangoapps/courseware/model_data.py @@ -687,7 +687,7 @@ class FieldDataCache(object): A cache of django model objects needed to supply the data for a module and its descendants """ - def __init__(self, descriptors, course_id, user, select_for_update=False, asides=None, read_only=False): + def __init__(self, descriptors, course_id, user, asides=None, read_only=False): """ Find any courseware.models objects that are needed by any descriptor in descriptors. Attempts to minimize the number of queries to the database. @@ -698,7 +698,6 @@ class FieldDataCache(object): descriptors: A list of XModuleDescriptors. course_id: The id of the current course user: The user for which to cache data - select_for_update: Ignored asides: The list of aside types to load, or None to prefetch no asides. read_only: We should not perform writes (they become a no-op). """ @@ -785,7 +784,7 @@ class FieldDataCache(object): @classmethod def cache_for_descriptor_descendents(cls, course_id, user, descriptor, depth=None, descriptor_filter=lambda descriptor: True, - select_for_update=False, asides=None, read_only=False): + asides=None, read_only=False): """ course_id: the course in the context of which we want StudentModules. user: the django user for whom to load modules. @@ -794,9 +793,8 @@ class FieldDataCache(object): the supplied descriptor. If depth is None, load all descendant StudentModules descriptor_filter is a function that accepts a descriptor and return whether the field data should be cached - select_for_update: Ignored """ - cache = FieldDataCache([], course_id, user, select_for_update, asides=asides, read_only=read_only) + cache = FieldDataCache([], course_id, user, asides=asides, read_only=read_only) cache.add_descriptor_descendents(descriptor, depth, descriptor_filter) return cache