diff --git a/cms/envs/common.py b/cms/envs/common.py
index f3dd1f9a11..c6fe7ce170 100644
--- a/cms/envs/common.py
+++ b/cms/envs/common.py
@@ -932,6 +932,9 @@ INSTALLED_APPS = (
     # Site configuration for theming and behavioral modification
     'openedx.core.djangoapps.site_configuration',
 
+    # Ability to detect and special-case crawler behavior
+    'openedx.core.djangoapps.crawlers',
+
     # comment common
     'django_comment_common',
 
diff --git a/openedx/core/djangoapps/crawlers/models.py b/openedx/core/djangoapps/crawlers/models.py
index c39c6ba5eb..b7c850b40e 100644
--- a/openedx/core/djangoapps/crawlers/models.py
+++ b/openedx/core/djangoapps/crawlers/models.py
@@ -2,6 +2,7 @@
 This module handles the detection of crawlers, so that we can handle them
 appropriately in other parts of the code.
 """
+import six
 
 from config_models.models import ConfigurationModel
 from django.db import models
@@ -39,6 +40,14 @@ class CrawlersConfig(ConfigurationModel):
         if (not req_user_agent) or (not crawler_agents):
             return False
 
+        # The crawler_agents list we pull from our model always has unicode objects, but the
+        # req_user_agent we get from HTTP headers ultimately comes to us via WSGI. That
+        # value is an ISO-8859-1 encoded byte string in Python 2.7 (and in the HTTP spec), but
+        # it will be a unicode str when we move to Python 3.x. This code should work under
+        # either version.
+        if isinstance(req_user_agent, six.binary_type):
+            crawler_agents = [crawler_agent.encode('iso-8859-1') for crawler_agent in crawler_agents]
+
         # We perform prefix matching of the crawler agent here so that we don't
         # have to worry about version bumps.
         return any(
diff --git a/openedx/core/djangoapps/crawlers/tests/__init__.py b/openedx/core/djangoapps/crawlers/tests/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/openedx/core/djangoapps/crawlers/tests/test_models.py b/openedx/core/djangoapps/crawlers/tests/test_models.py
new file mode 100644
index 0000000000..0e60f398d4
--- /dev/null
+++ b/openedx/core/djangoapps/crawlers/tests/test_models.py
@@ -0,0 +1,41 @@
+# -*- coding: utf-8 -*-
+"""
+Tests that the request came from a crawler or not.
+"""
+import ddt
+from django.test import TestCase
+from django.http import HttpRequest
+from ..models import CrawlersConfig
+
+
+@ddt.ddt
+class CrawlersConfigTest(TestCase):
+
+    def setUp(self):
+        super(CrawlersConfigTest, self).setUp()
+        CrawlersConfig(known_user_agents='edX-downloader,crawler_foo', enabled=True).save()
+
+    @ddt.data(
+        "Mozilla/5.0 (Linux; Android 5.1; Nexus 5 Build/LMY47I; wv) AppleWebKit/537.36 (KHTML, like Gecko) "
+        "Version/4.0 Chrome/47.0.2526.100 Mobile Safari/537.36 edX/org.edx.mobile/2.0.0",
+        "Le Héros des Deux Mondes",
+    )
+    def test_req_user_agent_is_not_crawler(self, req_user_agent):
+        """
+        verify that the request did not come from a crawler.
+        """
+        fake_request = HttpRequest()
+        fake_request.META['HTTP_USER_AGENT'] = req_user_agent
+        self.assertFalse(CrawlersConfig.is_crawler(fake_request))
+
+    @ddt.data(
+        u"edX-downloader",
+        "crawler_foo".encode("utf-8")
+    )
+    def test_req_user_agent_is_crawler(self, req_user_agent):
+        """
+        verify that the request came from a crawler.
+        """
+        fake_request = HttpRequest()
+        fake_request.META['HTTP_USER_AGENT'] = req_user_agent
+        self.assertTrue(CrawlersConfig.is_crawler(fake_request))