64 lines
2.2 KiB
Python
64 lines
2.2 KiB
Python
"""
This module handles the detection of crawlers, so that we can handle them
appropriately in other parts of the code.
"""
|
|
|
|
|
|
from config_models.models import ConfigurationModel
|
|
from django.db import models
|
|
from django.utils.encoding import python_2_unicode_compatible
|
|
|
|
|
|
@python_2_unicode_compatible
class CrawlersConfig(ConfigurationModel):
    """
    Configuration for the crawlers django app.

    Holds a comma-separated list of user agent prefixes that identify known
    crawlers, and provides ``is_crawler`` to test a request against that list.

    .. no_pii:
    """

    class Meta:
        app_label = "crawlers"

    # Comma-separated user agent prefixes. ``is_crawler`` matches each entry
    # against the beginning of the request's User-Agent header.
    known_user_agents = models.TextField(
        blank=True,
        help_text="A comma-separated list of known crawler user agents.",
        default='edX-downloader',
    )

    def __str__(self):
        """Return a short description that includes the configured agent list."""
        return f'CrawlersConfig("{self.known_user_agents}")'

    @classmethod
    def is_crawler(cls, request):
        """Determine if the request came from a crawler or not.

        This method is simplistic and only looks at the user agent header at the
        moment, but could later be improved to be smarter about detection.

        Arguments:
            request: object exposing a ``META`` mapping from which the
                ``HTTP_USER_AGENT`` entry is read (``str`` or ``bytes``).

        Returns:
            bool: True when the configuration returned by ``cls.current()`` is
            enabled and the request's user agent starts with one of the
            configured crawler agents; False otherwise, including when the
            request has no user agent or no crawler agents are configured.
        """
        current = cls.current()
        if not current.enabled:
            return False

        req_user_agent = request.META.get('HTTP_USER_AGENT')
        if not req_user_agent:
            return False

        # The header reaches us via WSGI and may be an ISO-8859-1 encoded byte
        # string. Decode it once so the comparison happens entirely in text;
        # ISO-8859-1 maps every byte value, so this cannot fail, whereas
        # encoding the configured agents could raise for non-Latin-1 entries.
        if isinstance(req_user_agent, bytes):
            req_user_agent = req_user_agent.decode('iso-8859-1')

        # Drop empty entries: "".split(",") yields [""] and a trailing or
        # doubled comma yields "" as well, and an empty prefix would match
        # every user agent. Leading whitespace (e.g. "a, b") is removed since
        # an entry written that way is meant as "b"; trailing whitespace is
        # kept because it may be a deliberate part of the prefix.
        crawler_agents = tuple(
            agent
            for agent in (raw.lstrip() for raw in current.known_user_agents.split(","))
            if agent
        )
        if not crawler_agents:
            return False

        # We perform prefix matching of the crawler agent here so that we don't
        # have to worry about version bumps.
        return req_user_agent.startswith(crawler_agents)
|