diff --git a/lms/djangoapps/courseware/model_data.py b/lms/djangoapps/courseware/model_data.py index fe2e88c89e..d450785063 100644 --- a/lms/djangoapps/courseware/model_data.py +++ b/lms/djangoapps/courseware/model_data.py @@ -687,7 +687,7 @@ class FieldDataCache(object): A cache of django model objects needed to supply the data for a module and its descendants """ - def __init__(self, descriptors, course_id, user, select_for_update=False, asides=None): + def __init__(self, descriptors, course_id, user, asides=None, read_only=False): """ Find any courseware.models objects that are needed by any descriptor in descriptors. Attempts to minimize the number of queries to the database. @@ -698,8 +698,8 @@ class FieldDataCache(object): descriptors: A list of XModuleDescriptors. course_id: The id of the current course user: The user for which to cache data - select_for_update: Ignored asides: The list of aside types to load, or None to prefetch no asides. + read_only: We should not perform writes (they become a no-op). """ if asides is None: self.asides = [] @@ -709,6 +709,7 @@ class FieldDataCache(object): assert isinstance(course_id, CourseKey) self.course_id = course_id self.user = user + self.read_only = read_only self.cache = { Scope.user_state: UserStateCache( @@ -783,7 +784,7 @@ class FieldDataCache(object): @classmethod def cache_for_descriptor_descendents(cls, course_id, user, descriptor, depth=None, descriptor_filter=lambda descriptor: True, - select_for_update=False, asides=None): + asides=None, read_only=False): """ course_id: the course in the context of which we want StudentModules. user: the django user for whom to load modules. @@ -792,9 +793,8 @@ class FieldDataCache(object): the supplied descriptor. If depth is None, load all descendant StudentModules descriptor_filter is a function that accepts a descriptor and return whether the field data should be cached - select_for_update: Ignored """ - cache = FieldDataCache([], course_id, user, select_for_update, asides=asides) + cache = FieldDataCache([], course_id, user, asides=asides, read_only=read_only) cache.add_descriptor_descendents(descriptor, depth, descriptor_filter) return cache @@ -840,6 +840,8 @@ class FieldDataCache(object): kv_dict (dict): dict mapping from `DjangoKeyValueStore.Key`s to field values Raises: DatabaseError if any fields fail to save """ + if self.read_only: + return saved_fields = [] by_scope = defaultdict(dict) @@ -875,6 +877,8 @@ class FieldDataCache(object): Raises: KeyError if key isn't found in the cache """ + if self.read_only: + return if key.scope.user == UserScope.ONE and not self.user.is_anonymous(): # If we're getting user data, we expect that the key matches the diff --git a/lms/djangoapps/courseware/module_render.py b/lms/djangoapps/courseware/module_render.py index 77bbff227d..33765384d0 100644 --- a/lms/djangoapps/courseware/module_render.py +++ b/lms/djangoapps/courseware/module_render.py @@ -49,6 +49,7 @@ from lms.djangoapps.lms_xblock.models import XBlockAsidesConfig from lms.djangoapps.lms_xblock.runtime import LmsModuleSystem from lms.djangoapps.verify_student.services import VerificationService, ReverificationService from openedx.core.djangoapps.bookmarks.services import BookmarksService +from openedx.core.djangoapps.crawlers.models import CrawlersConfig from openedx.core.djangoapps.credit.services import CreditService from openedx.core.djangoapps.util.user_utils import SystemUser from openedx.core.lib.xblock_utils import ( @@ -917,7 +918,8 @@ def get_module_by_usage_id(request, course_id, usage_id, disable_staff_debug_inf field_data_cache = FieldDataCache.cache_for_descriptor_descendents( course_id, user, - descriptor + descriptor, + read_only=CrawlersConfig.is_crawler(request), ) instance = get_module_for_descriptor( user, diff --git a/lms/djangoapps/courseware/tests/test_views.py b/lms/djangoapps/courseware/tests/test_views.py index 7ee98fbcb5..6f2c7b1682 100644 --- a/lms/djangoapps/courseware/tests/test_views.py +++ b/lms/djangoapps/courseware/tests/test_views.py @@ -50,6 +50,7 @@ from lms.djangoapps.commerce.utils import EcommerceService # pylint: disable=im from milestones.tests.utils import MilestonesTestCaseMixin from openedx.core.djangoapps.self_paced.models import SelfPacedConfiguration from openedx.core.lib.gating import api as gating_api +from openedx.core.djangoapps.crawlers.models import CrawlersConfig from student.models import CourseEnrollment from student.tests.factories import AdminFactory, UserFactory, CourseEnrollmentFactory from util.tests.test_date_utils import fake_ugettext, fake_pgettext @@ -58,7 +59,7 @@ from util.views import ensure_valid_course_key from xmodule.modulestore import ModuleStoreEnum from xmodule.modulestore.django import modulestore from xmodule.modulestore.tests.django_utils import TEST_DATA_MIXED_MODULESTORE -from xmodule.modulestore.tests.django_utils import ModuleStoreTestCase +from xmodule.modulestore.tests.django_utils import ModuleStoreTestCase, SharedModuleStoreTestCase from xmodule.modulestore.tests.factories import CourseFactory, ItemFactory, check_mongo_calls from openedx.core.djangoapps.credit.api import set_credit_requirements from openedx.core.djangoapps.credit.models import CreditCourse, CreditProvider @@ -2065,3 +2066,83 @@ class TestRenderXBlockSelfPaced(TestRenderXBlock): def course_options(self): return {'self_paced': True} + + +class TestIndexViewCrawlerStudentStateWrites(SharedModuleStoreTestCase): + """ + Ensure that courseware index requests do not trigger student state writes. + + This is to prevent locking issues that have caused latency spikes in the + courseware_studentmodule table when concurrent requests each try to update + the same rows for sequence, section, and course positions. + """ + @classmethod + def setUpClass(cls): + """Set up the simplest course possible.""" + # setUpClassAndTestData() already calls setUpClass on SharedModuleStoreTestCase + # pylint: disable=super-method-not-called + with super(TestIndexViewCrawlerStudentStateWrites, cls).setUpClassAndTestData(): + cls.course = CourseFactory.create() + with cls.store.bulk_operations(cls.course.id): + cls.chapter = ItemFactory.create(category='chapter', parent_location=cls.course.location) + cls.section = ItemFactory.create(category='sequential', parent_location=cls.chapter.location) + cls.vertical = ItemFactory.create(category='vertical', parent_location=cls.section.location) + + @classmethod + def setUpTestData(cls): + """Set up and enroll our fake user in the course.""" + cls.password = 'test' + cls.user = UserFactory(password=cls.password) + CourseEnrollment.enroll(cls.user, cls.course.id) + + def setUp(self): + """Do the client login.""" + super(TestIndexViewCrawlerStudentStateWrites, self).setUp() + self.client.login(username=self.user.username, password=self.password) + + def test_write_by_default(self): + """By default, always write student state, regardless of user agent.""" + with patch('courseware.model_data.UserStateCache.set_many') as patched_state_client_set_many: + # Simulate someone using Chrome + self._load_courseware('Mozilla/5.0 AppleWebKit/537.36') + self.assertTrue(patched_state_client_set_many.called) + patched_state_client_set_many.reset_mock() + + # Common crawler user agent + self._load_courseware('edX-downloader/0.1') + self.assertTrue(patched_state_client_set_many.called) + + def test_writes_with_config(self): + """Test state writes (or lack thereof) based on config values.""" + CrawlersConfig.objects.create(known_user_agents='edX-downloader,crawler_foo', enabled=True) + with patch('courseware.model_data.UserStateCache.set_many') as patched_state_client_set_many: + # Exact matching of crawler user agent + self._load_courseware('crawler_foo') + self.assertFalse(patched_state_client_set_many.called) + + # Partial matching of crawler user agent + self._load_courseware('edX-downloader/0.1') + self.assertFalse(patched_state_client_set_many.called) + + # Simulate an actual browser hitting it (we should write) + self._load_courseware('Mozilla/5.0 AppleWebKit/537.36') + self.assertTrue(patched_state_client_set_many.called) + + # Disabling the crawlers config should revert us to default behavior + CrawlersConfig.objects.create(enabled=False) + self.test_write_by_default() + + def _load_courseware(self, user_agent): + """Helper to load the actual courseware page.""" + url = reverse( + 'courseware_section', + kwargs={ + 'course_id': unicode(self.course.id), + 'chapter': unicode(self.chapter.location.name), + 'section': unicode(self.section.location.name), + } + ) + response = self.client.get(url, HTTP_USER_AGENT=user_agent) + # Make sure we get back an actual 200, and aren't redirected because we + # messed up the setup somehow (e.g. didn't enroll properly) + self.assertEqual(response.status_code, 200) diff --git a/lms/djangoapps/courseware/views/index.py b/lms/djangoapps/courseware/views/index.py index 2744cb8433..cffdee677d 100644 --- a/lms/djangoapps/courseware/views/index.py +++ b/lms/djangoapps/courseware/views/index.py @@ -26,6 +26,7 @@ from xblock.fragment import Fragment from opaque_keys.edx.keys import CourseKey from openedx.core.djangoapps.lang_pref import LANGUAGE_KEY from openedx.core.djangoapps.user_api.preferences.api import get_user_preference +from openedx.core.djangoapps.crawlers.models import CrawlersConfig from shoppingcart.models import CourseRegistrationCode from student.models import CourseEnrollment from student.views import is_course_blocked @@ -101,7 +102,7 @@ class CoursewareIndex(View): self.course = get_course_with_access(request.user, 'load', self.course_key, depth=CONTENT_DEPTH) self.is_staff = has_access(request.user, 'staff', self.course) self._setup_masquerade_for_effective_user() - return self._get() + return self._get(request) except Redirect as redirect_error: return redirect(redirect_error.url) except UnicodeEncodeError: @@ -127,12 +128,12 @@ class CoursewareIndex(View): # Set the user in the request to the effective user. self.request.user = self.effective_user - def _get(self): + def _get(self, request): """ Render the index page. """ self._redirect_if_needed_to_access_course() - self._prefetch_and_bind_course() + self._prefetch_and_bind_course(request) if self.course.has_children_at_depth(CONTENT_DEPTH): self._reset_section_to_exam_if_required() @@ -324,13 +325,17 @@ class CoursewareIndex(View): if self.chapter: return self._find_block(self.chapter, self.section_url_name, 'section') - def _prefetch_and_bind_course(self): + def _prefetch_and_bind_course(self, request): """ Prefetches all descendant data for the requested section and sets up the runtime, which binds the request user to the section. """ self.field_data_cache = FieldDataCache.cache_for_descriptor_descendents( - self.course_key, self.effective_user, self.course, depth=CONTENT_DEPTH, + self.course_key, + self.effective_user, + self.course, + depth=CONTENT_DEPTH, + read_only=CrawlersConfig.is_crawler(request), ) self.course = get_module_for_descriptor( diff --git a/lms/envs/common.py b/lms/envs/common.py index 89874df437..f8504e55c5 100644 --- a/lms/envs/common.py +++ b/lms/envs/common.py @@ -2160,6 +2160,9 @@ INSTALLED_APPS = ( # Customized celery tasks, including persisting failed tasks so they can # be retried 'openedx.core.djangoapps.celery_utils', + + # Ability to detect and special-case crawler behavior + 'openedx.core.djangoapps.crawlers', ) # Migrations which are not in the standard module "migrations" diff --git a/openedx/core/djangoapps/crawlers/__init__.py b/openedx/core/djangoapps/crawlers/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/openedx/core/djangoapps/crawlers/admin.py b/openedx/core/djangoapps/crawlers/admin.py new file mode 100644 index 0000000000..45085716e3 --- /dev/null +++ b/openedx/core/djangoapps/crawlers/admin.py @@ -0,0 +1,7 @@ +"""Admin panel for configuring which user agents we consider to be Crawlers.""" +from django.contrib import admin + +from config_models.admin import ConfigurationModelAdmin +from .models import CrawlersConfig + +admin.site.register(CrawlersConfig, ConfigurationModelAdmin) diff --git a/openedx/core/djangoapps/crawlers/migrations/0001_initial.py b/openedx/core/djangoapps/crawlers/migrations/0001_initial.py new file mode 100644 index 0000000000..0b95a9ec4b --- /dev/null +++ b/openedx/core/djangoapps/crawlers/migrations/0001_initial.py @@ -0,0 +1,30 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +from django.db import migrations, models +import django.db.models.deletion +from django.conf import settings + + +class Migration(migrations.Migration): + + dependencies = [ + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ] + + operations = [ + migrations.CreateModel( + name='CrawlersConfig', + fields=[ + ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)), + ('change_date', models.DateTimeField(auto_now_add=True, verbose_name='Change date')), + ('enabled', models.BooleanField(default=False, verbose_name='Enabled')), + ('known_user_agents', models.TextField(default=b'edX-downloader', help_text=b'A comma-separated list of known crawler user agents.', blank=True)), + ('changed_by', models.ForeignKey(on_delete=django.db.models.deletion.PROTECT, editable=False, to=settings.AUTH_USER_MODEL, null=True, verbose_name='Changed by')), + ], + options={ + 'ordering': ('-change_date',), + 'abstract': False, + }, + ), + ] diff --git a/openedx/core/djangoapps/crawlers/migrations/__init__.py b/openedx/core/djangoapps/crawlers/migrations/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/openedx/core/djangoapps/crawlers/models.py b/openedx/core/djangoapps/crawlers/models.py new file mode 100644 index 0000000000..1cb7dcdef3 --- /dev/null +++ b/openedx/core/djangoapps/crawlers/models.py @@ -0,0 +1,45 @@ +""" +This module handles the detection of crawlers, so that we can handle them +appropriately in other parts of the code. +""" +from django.db import models + +from config_models.models import ConfigurationModel + + +class CrawlersConfig(ConfigurationModel): + """Configuration for the crawlers django app.""" + known_user_agents = models.TextField( + blank=True, + help_text="A comma-separated list of known crawler user agents.", + default='edX-downloader', + ) + + def __unicode__(self): + return u'CrawlersConfig("{}")'.format(self.known_user_agents) + + @classmethod + def is_crawler(cls, request): + """Determine if the request came from a crawler or not. + + This method is simplistic and only looks at the user agent header at the + moment, but could later be improved to be smarter about detection. + """ + current = cls.current() + if not current.enabled: + return False + + req_user_agent = request.META.get('HTTP_USER_AGENT') + crawler_agents = current.known_user_agents.split(",") + + # If there was no user agent detected or no crawler agents configured, + # then just return False. + if (not req_user_agent) or (not crawler_agents): + return False + + # We perform prefix matching of the crawler agent here so that we don't + # have to worry about version bumps. + return any( + req_user_agent.startswith(crawler_agent) + for crawler_agent in crawler_agents + ) diff --git a/openedx/core/djangoapps/crawlers/tests.py b/openedx/core/djangoapps/crawlers/tests.py new file mode 100644 index 0000000000..e69de29bb2