From b615a8767f1bc1b0d7de5fc357d8815e58e467b1 Mon Sep 17 00:00:00 2001 From: Tim McCormack Date: Fri, 6 Sep 2024 14:42:32 +0000 Subject: [PATCH] feat: Convert contentserver to view permanently (drop middleware and flag) (#35420) Final planned portion of https://github.com/openedx/edx-platform/issues/34702 -- waffle flag and middleware are removed. Commits: 1. Feature change - Delete `content_server.use_view` waffle flag in favor of always using view - Delete the husk of `StaticContentServerMiddleware` and references to it - Update views module docstring 2. Refactor (move) - Move contentserver implementation into views.py 3. Lint cleanup - Fix import ordering (from refactor debris + amnestied lint) --- cms/envs/common.py | 1 - lms/envs/common.py | 1 - .../djangoapps/contentserver/middleware.py | 368 ----------------- .../contentserver/test/test_contentserver.py | 19 +- .../core/djangoapps/contentserver/views.py | 371 ++++++++++++++++-- 5 files changed, 348 insertions(+), 412 deletions(-) delete mode 100644 openedx/core/djangoapps/contentserver/middleware.py diff --git a/cms/envs/common.py b/cms/envs/common.py index 8daa08aeb1..34dd8503f3 100644 --- a/cms/envs/common.py +++ b/cms/envs/common.py @@ -949,7 +949,6 @@ MIDDLEWARE = [ 'openedx.core.djangoapps.cache_toolbox.middleware.CacheBackedAuthenticationMiddleware', 'common.djangoapps.student.middleware.UserStandingMiddleware', - 'openedx.core.djangoapps.contentserver.middleware.StaticContentServerMiddleware', 'django.contrib.messages.middleware.MessageMiddleware', 'common.djangoapps.track.middleware.TrackMiddleware', diff --git a/lms/envs/common.py b/lms/envs/common.py index 122ce7383b..3346692153 100644 --- a/lms/envs/common.py +++ b/lms/envs/common.py @@ -2295,7 +2295,6 @@ MIDDLEWARE = [ 'openedx.core.djangoapps.safe_sessions.middleware.EmailChangeMiddleware', 'common.djangoapps.student.middleware.UserStandingMiddleware', - 'openedx.core.djangoapps.contentserver.middleware.StaticContentServerMiddleware', # Adds user tags to tracking events # Must go before TrackMiddleware, to get the context set up diff --git a/openedx/core/djangoapps/contentserver/middleware.py b/openedx/core/djangoapps/contentserver/middleware.py deleted file mode 100644 index 6eb72da458..0000000000 --- a/openedx/core/djangoapps/contentserver/middleware.py +++ /dev/null @@ -1,368 +0,0 @@ -""" -Middleware to serve assets. -""" - - -import datetime -import logging - -from django.http import ( - HttpResponse, - HttpResponseBadRequest, - HttpResponseForbidden, - HttpResponseNotFound, - HttpResponseNotModified, - HttpResponsePermanentRedirect -) -from django.utils.deprecation import MiddlewareMixin -from edx_django_utils.monitoring import set_custom_attribute -from edx_toggles.toggles import WaffleFlag -from opaque_keys import InvalidKeyError -from opaque_keys.edx.locator import AssetLocator - -from openedx.core.djangoapps.header_control import force_header_for_response -from common.djangoapps.student.models import CourseEnrollment -from xmodule.assetstore.assetmgr import AssetManager # lint-amnesty, pylint: disable=wrong-import-order -from xmodule.contentstore.content import XASSET_LOCATION_TAG, StaticContent # lint-amnesty, pylint: disable=wrong-import-order -from xmodule.exceptions import NotFoundError # lint-amnesty, pylint: disable=wrong-import-order -from xmodule.modulestore import InvalidLocationError # lint-amnesty, pylint: disable=wrong-import-order -from xmodule.modulestore.exceptions import ItemNotFoundError # lint-amnesty, pylint: disable=wrong-import-order - -from .caching import get_cached_content, set_cached_content -from .models import CdnUserAgentsConfig, CourseAssetCacheTtlConfig - -log = logging.getLogger(__name__) - -# .. toggle_name: content_server.use_view -# .. toggle_implementation: WaffleFlag -# .. toggle_default: False -# .. toggle_description: Deployment flag for switching asset serving from a middleware -# to a view. Intended to be used once in each environment to test the cutover and -# ensure there are no errors or changes in behavior. Once this has been tested, -# the middleware can be fully converted to a view. -# .. toggle_use_cases: temporary -# .. toggle_creation_date: 2024-05-02 -# .. toggle_target_removal_date: 2024-07-01 -# .. toggle_tickets: https://github.com/openedx/edx-platform/issues/34702 -CONTENT_SERVER_USE_VIEW = WaffleFlag('content_server.use_view', module_name=__name__) - -# TODO: Soon as we have a reasonable way to serialize/deserialize AssetKeys, we need -# to change this file so instead of using course_id_partial, we're just using asset keys - -HTTP_DATE_FORMAT = "%a, %d %b %Y %H:%M:%S GMT" - - -class StaticContentServerMiddleware(MiddlewareMixin): - """ - Shim to maintain old pattern of serving course assets from a middleware. See views.py. - """ - def process_request(self, request): - """Intercept asset request or allow view to handle it, depending on config.""" - if CONTENT_SERVER_USE_VIEW.is_enabled(): - return - else: - set_custom_attribute('content_server.handled_by.middleware', True) - return IMPL.process_request(request) - - -class StaticContentServer(): - """ - Serves course assets to end users. Colloquially referred to as "contentserver." - """ - def is_asset_request(self, request): - """Determines whether the given request is an asset request""" - # Don't change this without updating urls.py! See docstring of views.py. - return ( - request.path.startswith('/' + XASSET_LOCATION_TAG + '/') - or - request.path.startswith('/' + AssetLocator.CANONICAL_NAMESPACE) - or - StaticContent.is_versioned_asset_path(request.path) - ) - - # pylint: disable=too-many-statements - def process_request(self, request): - """Process the given request""" - asset_path = request.path - - if self.is_asset_request(request): # lint-amnesty, pylint: disable=too-many-nested-blocks - # Make sure we can convert this request into a location. - if AssetLocator.CANONICAL_NAMESPACE in asset_path: - asset_path = asset_path.replace('block/', 'block@', 1) - - # If this is a versioned request, pull out the digest and chop off the prefix. - requested_digest = None - if StaticContent.is_versioned_asset_path(asset_path): - requested_digest, asset_path = StaticContent.parse_versioned_asset_path(asset_path) - - # Make sure we have a valid location value for this asset. - try: - loc = StaticContent.get_location_from_path(asset_path) - except (InvalidLocationError, InvalidKeyError): - return HttpResponseBadRequest() - - # Attempt to load the asset to make sure it exists, and grab the asset digest - # if we're able to load it. - actual_digest = None - try: - content = self.load_asset_from_location(loc) - actual_digest = getattr(content, "content_digest", None) - except (ItemNotFoundError, NotFoundError): - return HttpResponseNotFound() - - # If this was a versioned asset, and the digest doesn't match, redirect - # them to the actual version. - if requested_digest is not None and actual_digest is not None and (actual_digest != requested_digest): - actual_asset_path = StaticContent.add_version_to_asset_path(asset_path, actual_digest) - return HttpResponsePermanentRedirect(actual_asset_path) - - # Set the basics for this request. Make sure that the course key for this - # asset has a run, which old-style courses do not. Otherwise, this will - # explode when the key is serialized to be sent to NR. - safe_course_key = loc.course_key - if safe_course_key.run is None: - safe_course_key = safe_course_key.replace(run='only') - - set_custom_attribute('course_id', safe_course_key) - set_custom_attribute('org', loc.org) - set_custom_attribute('contentserver.path', loc.path) - - # Figure out if this is a CDN using us as the origin. - is_from_cdn = StaticContentServer.is_cdn_request(request) - set_custom_attribute('contentserver.from_cdn', is_from_cdn) - - # Check if this content is locked or not. - locked = self.is_content_locked(content) - set_custom_attribute('contentserver.locked', locked) - - # Check that user has access to the content. - if not self.is_user_authorized(request, content, loc): - return HttpResponseForbidden('Unauthorized') - - # Figure out if the client sent us a conditional request, and let them know - # if this asset has changed since then. - last_modified_at_str = content.last_modified_at.strftime(HTTP_DATE_FORMAT) - if 'HTTP_IF_MODIFIED_SINCE' in request.META: - if_modified_since = request.META['HTTP_IF_MODIFIED_SINCE'] - if if_modified_since == last_modified_at_str: - return HttpResponseNotModified() - - # *** File streaming within a byte range *** - # If a Range is provided, parse Range attribute of the request - # Add Content-Range in the response if Range is structurally correct - # Request -> Range attribute structure: "Range: bytes=first-[last]" - # Response -> Content-Range attribute structure: "Content-Range: bytes first-last/totalLength" - # http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35 - response = None - if request.META.get('HTTP_RANGE'): - # If we have a StaticContent, get a StaticContentStream. Can't manipulate the bytes otherwise. - if isinstance(content, StaticContent): - content = AssetManager.find(loc, as_stream=True) - - header_value = request.META['HTTP_RANGE'] - try: - unit, ranges = parse_range_header(header_value, content.length) - except ValueError as exception: - # If the header field is syntactically invalid it should be ignored. - log.exception( - "%s in Range header: %s for content: %s", - str(exception), header_value, str(loc) - ) - else: - if unit != 'bytes': - # Only accept ranges in bytes - log.warning("Unknown unit in Range header: %s for content: %s", header_value, str(loc)) - elif len(ranges) > 1: - # According to Http/1.1 spec content for multiple ranges should be sent as a multipart message. - # http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.16 - # But we send back the full content. - log.warning( - "More than 1 ranges in Range header: %s for content: %s", header_value, str(loc) - ) - else: - first, last = ranges[0] - - if 0 <= first <= last < content.length: - # If the byte range is satisfiable - response = HttpResponse(content.stream_data_in_range(first, last)) - response['Content-Range'] = 'bytes {first}-{last}/{length}'.format( - first=first, last=last, length=content.length - ) - response['Content-Length'] = str(last - first + 1) - response.status_code = 206 # Partial Content - - set_custom_attribute('contentserver.ranged', True) - else: - log.warning( - "Cannot satisfy ranges in Range header: %s for content: %s", - header_value, str(loc) - ) - return HttpResponse(status=416) # Requested Range Not Satisfiable - - # If Range header is absent or syntactically invalid return a full content response. - if response is None: - response = HttpResponse(content.stream_data()) - response['Content-Length'] = content.length - - set_custom_attribute('contentserver.content_len', content.length) - set_custom_attribute('contentserver.content_type', content.content_type) - - # "Accept-Ranges: bytes" tells the user that only "bytes" ranges are allowed - response['Accept-Ranges'] = 'bytes' - response['Content-Type'] = content.content_type - response['X-Frame-Options'] = 'ALLOW' - - # Set any caching headers, and do any response cleanup needed. Based on how much - # middleware we have in place, there's no easy way to use the built-in Django - # utilities and properly sanitize and modify a response to ensure that it is as - # cacheable as possible, which is why we do it ourselves. - self.set_caching_headers(content, response) - - return response - - def set_caching_headers(self, content, response): - """ - Sets caching headers based on whether or not the asset is locked. - """ - - is_locked = getattr(content, "locked", False) - - # We want to signal to the end user's browser, and to any intermediate proxies/caches, - # whether or not this asset is cacheable. If we have a TTL configured, we inform the - # caller, for unlocked assets, how long they are allowed to cache it. Since locked - # assets should be restricted to enrolled students, we simply send headers that - # indicate there should be no caching whatsoever. - cache_ttl = CourseAssetCacheTtlConfig.get_cache_ttl() - if cache_ttl > 0 and not is_locked: - set_custom_attribute('contentserver.cacheable', True) - - response['Expires'] = StaticContentServer.get_expiration_value(datetime.datetime.utcnow(), cache_ttl) - response['Cache-Control'] = "public, max-age={ttl}, s-maxage={ttl}".format(ttl=cache_ttl) - elif is_locked: - set_custom_attribute('contentserver.cacheable', False) - - response['Cache-Control'] = "private, no-cache, no-store" - - response['Last-Modified'] = content.last_modified_at.strftime(HTTP_DATE_FORMAT) - - # Force the Vary header to only vary responses on Origin, so that XHR and browser requests get cached - # separately and don't screw over one another. i.e. a browser request that doesn't send Origin, and - # caches a version of the response without CORS headers, in turn breaking XHR requests. - force_header_for_response(response, 'Vary', 'Origin') - - @staticmethod - def is_cdn_request(request): - """ - Attempts to determine whether or not the given request is coming from a CDN. - - Currently, this is a static check because edx.org only uses CloudFront, but may - be expanded in the future. - """ - cdn_user_agents = CdnUserAgentsConfig.get_cdn_user_agents() - user_agent = request.META.get('HTTP_USER_AGENT', '') - if user_agent in cdn_user_agents: - # This is a CDN request. - return True - - return False - - @staticmethod - def get_expiration_value(now, cache_ttl): - """Generates an RFC1123 datetime string based on a future offset.""" - expire_dt = now + datetime.timedelta(seconds=cache_ttl) - return expire_dt.strftime(HTTP_DATE_FORMAT) - - def is_content_locked(self, content): - """ - Determines whether or not the given content is locked. - """ - return bool(getattr(content, "locked", False)) - - def is_user_authorized(self, request, content, location): - """ - Determines whether or not the user for this request is authorized to view the given asset. - """ - if not self.is_content_locked(content): - return True - - if not hasattr(request, "user") or not request.user.is_authenticated: - return False - - if not request.user.is_staff: - deprecated = getattr(location, 'deprecated', False) - if deprecated and not CourseEnrollment.is_enrolled_by_partial(request.user, location.course_key): - return False - if not deprecated and not CourseEnrollment.is_enrolled(request.user, location.course_key): - return False - - return True - - def load_asset_from_location(self, location): - """ - Loads an asset based on its location, either retrieving it from a cache - or loading it directly from the contentstore. - """ - - # See if we can load this item from cache. - content = get_cached_content(location) - if content is None: - # Not in cache, so just try and load it from the asset manager. - try: - content = AssetManager.find(location, as_stream=True) - except (ItemNotFoundError, NotFoundError): # lint-amnesty, pylint: disable=try-except-raise - raise - - # Now that we fetched it, let's go ahead and try to cache it. We cap this at 1MB - # because it's the default for memcached and also we don't want to do too much - # buffering in memory when we're serving an actual request. - if content.length is not None and content.length < 1048576: - content = content.copy_to_in_mem() - set_cached_content(content) - - return content - - -IMPL = StaticContentServer() - - -def parse_range_header(header_value, content_length): - """ - Returns the unit and a list of (start, end) tuples of ranges. - - Raises ValueError if header is syntactically invalid or does not contain a range. - - See spec for details: http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35 - """ - - unit = None - ranges = [] - - if '=' in header_value: - unit, byte_ranges_string = header_value.split('=') - - # Parse the byte ranges. - for byte_range_string in byte_ranges_string.split(','): - byte_range_string = byte_range_string.strip() - # Case 0: - if '-' not in byte_range_string: # Invalid syntax of header value. # lint-amnesty, pylint: disable=no-else-raise - raise ValueError('Invalid syntax.') - # Case 1: -500 - elif byte_range_string.startswith('-'): - first = max(0, (content_length + int(byte_range_string))) - last = content_length - 1 - # Case 2: 500- - elif byte_range_string.endswith('-'): - first = int(byte_range_string[0:-1]) - last = content_length - 1 - # Case 3: 500-999 - else: - first, last = byte_range_string.split('-') - first = int(first) - last = min(int(last), content_length - 1) - - ranges.append((first, last)) - - if len(ranges) == 0: - raise ValueError('Invalid syntax') - - return unit, ranges diff --git a/openedx/core/djangoapps/contentserver/test/test_contentserver.py b/openedx/core/djangoapps/contentserver/test/test_contentserver.py index e9ee1f7ee5..e1c5e01c0a 100644 --- a/openedx/core/djangoapps/contentserver/test/test_contentserver.py +++ b/openedx/core/djangoapps/contentserver/test/test_contentserver.py @@ -4,7 +4,6 @@ Tests for StaticContentServer import copy - import datetime import logging import unittest @@ -17,18 +16,18 @@ from django.test import RequestFactory from django.test.client import Client from django.test.utils import override_settings from opaque_keys import InvalidKeyError -from xmodule.contentstore.django import contentstore -from xmodule.contentstore.content import StaticContent, VERSIONED_ASSETS_PREFIX -from xmodule.modulestore.django import modulestore -from xmodule.modulestore.tests.django_utils import TEST_DATA_SPLIT_MODULESTORE, SharedModuleStoreTestCase -from xmodule.modulestore.xml_importer import import_course_from_xml -from xmodule.assetstore.assetmgr import AssetManager -from xmodule.modulestore.exceptions import ItemNotFoundError from common.djangoapps.student.models import CourseEnrollment -from common.djangoapps.student.tests.factories import UserFactory, AdminFactory +from common.djangoapps.student.tests.factories import AdminFactory, UserFactory +from xmodule.assetstore.assetmgr import AssetManager +from xmodule.contentstore.content import VERSIONED_ASSETS_PREFIX, StaticContent +from xmodule.contentstore.django import contentstore +from xmodule.modulestore.django import modulestore +from xmodule.modulestore.exceptions import ItemNotFoundError +from xmodule.modulestore.tests.django_utils import TEST_DATA_SPLIT_MODULESTORE, SharedModuleStoreTestCase +from xmodule.modulestore.xml_importer import import_course_from_xml -from ..middleware import parse_range_header, HTTP_DATE_FORMAT, StaticContentServer +from ..views import HTTP_DATE_FORMAT, StaticContentServer, parse_range_header log = logging.getLogger(__name__) diff --git a/openedx/core/djangoapps/contentserver/views.py b/openedx/core/djangoapps/contentserver/views.py index 84bd4c3b10..7c40026415 100644 --- a/openedx/core/djangoapps/contentserver/views.py +++ b/openedx/core/djangoapps/contentserver/views.py @@ -9,28 +9,41 @@ Django what view handled the request, it does so by looking at the result of the `resolve` utility, but these URLs get a Resolver404 (because there's no registered urlpattern). -We'd like to turn this into a proper view: -https://github.com/openedx/edx-platform/issues/34702 +We've turned it into a proper view, with a few warts remaining: -The first step, seen here, is to have urlpatterns (redundant with the -middleware's `is_asset_request` method) and a view, but the view just calls into -the same code the middleware uses. The implementation of the middleware has been -moved into StaticContentServerImpl, leaving the middleware as just a shell -around the latter. - -A waffle flag chooses whether to allow the middleware to handle the request, or -whether to pass the request along to the view. Why? Because we might be relying -by accident on some weird behavior inherent to misusing a middleware this way, -and we need a way to quickly switch back if we encounter problems. - -If the view works, we can move all of StaticContentServerImpl directly into the -view and drop the middleware and the waffle flag. +- The view implementation is all bundled into a StaticContentServer class that + doesn't appear to have any state. The methods could likely just be extracted + as top-level functions. +- All three urlpatterns are registered to the same view, which then has to + re-parse the URL to determine which pattern is in effect. We should probably + have 3 views as entry points. """ -from django.http import HttpResponseNotFound +import datetime +import logging + +from django.http import ( + HttpResponse, + HttpResponseBadRequest, + HttpResponseForbidden, + HttpResponseNotFound, + HttpResponseNotModified, + HttpResponsePermanentRedirect +) from django.views.decorators.http import require_safe from edx_django_utils.monitoring import set_custom_attribute +from opaque_keys import InvalidKeyError +from opaque_keys.edx.locator import AssetLocator -from .middleware import CONTENT_SERVER_USE_VIEW, IMPL +from common.djangoapps.student.models import CourseEnrollment +from openedx.core.djangoapps.header_control import force_header_for_response +from xmodule.assetstore.assetmgr import AssetManager +from xmodule.contentstore.content import XASSET_LOCATION_TAG, StaticContent +from xmodule.exceptions import NotFoundError +from xmodule.modulestore import InvalidLocationError +from xmodule.modulestore.exceptions import ItemNotFoundError + +from .caching import get_cached_content, set_cached_content +from .models import CdnUserAgentsConfig, CourseAssetCacheTtlConfig @require_safe @@ -38,21 +51,315 @@ def course_assets_view(request): """ Serve course assets to end users. Colloquially referred to as "contentserver." """ - set_custom_attribute('content_server.handled_by.view', True) + return IMPL.process_request(request) - if not CONTENT_SERVER_USE_VIEW.is_enabled(): - # Should never happen; keep track of occurrences. - set_custom_attribute('content_server.view.called_when_disabled', True) - # But handle the request anyhow. - # We'll delegate request handling to an instance of the middleware - # until we can verify that the behavior is identical when requests - # come all the way through to the view. - response = IMPL.process_request(request) +log = logging.getLogger(__name__) - if response is None: - # Shouldn't happen - set_custom_attribute('content_server.view.no_response_from_impl', True) - return HttpResponseNotFound() - else: - return response +# TODO: Soon as we have a reasonable way to serialize/deserialize AssetKeys, we need +# to change this file so instead of using course_id_partial, we're just using asset keys + +HTTP_DATE_FORMAT = "%a, %d %b %Y %H:%M:%S GMT" + + +class StaticContentServer(): + """ + Serves course assets to end users. Colloquially referred to as "contentserver." + """ + def is_asset_request(self, request): + """Determines whether the given request is an asset request""" + # Don't change this without updating urls.py! See docstring of views.py. + return ( + request.path.startswith('/' + XASSET_LOCATION_TAG + '/') + or + request.path.startswith('/' + AssetLocator.CANONICAL_NAMESPACE) + or + StaticContent.is_versioned_asset_path(request.path) + ) + + # pylint: disable=too-many-statements + def process_request(self, request): + """Process the given request""" + asset_path = request.path + + if self.is_asset_request(request): # lint-amnesty, pylint: disable=too-many-nested-blocks + # Make sure we can convert this request into a location. + if AssetLocator.CANONICAL_NAMESPACE in asset_path: + asset_path = asset_path.replace('block/', 'block@', 1) + + # If this is a versioned request, pull out the digest and chop off the prefix. + requested_digest = None + if StaticContent.is_versioned_asset_path(asset_path): + requested_digest, asset_path = StaticContent.parse_versioned_asset_path(asset_path) + + # Make sure we have a valid location value for this asset. + try: + loc = StaticContent.get_location_from_path(asset_path) + except (InvalidLocationError, InvalidKeyError): + return HttpResponseBadRequest() + + # Attempt to load the asset to make sure it exists, and grab the asset digest + # if we're able to load it. + actual_digest = None + try: + content = self.load_asset_from_location(loc) + actual_digest = getattr(content, "content_digest", None) + except (ItemNotFoundError, NotFoundError): + return HttpResponseNotFound() + + # If this was a versioned asset, and the digest doesn't match, redirect + # them to the actual version. + if requested_digest is not None and actual_digest is not None and (actual_digest != requested_digest): + actual_asset_path = StaticContent.add_version_to_asset_path(asset_path, actual_digest) + return HttpResponsePermanentRedirect(actual_asset_path) + + # Set the basics for this request. Make sure that the course key for this + # asset has a run, which old-style courses do not. Otherwise, this will + # explode when the key is serialized to be sent to NR. + safe_course_key = loc.course_key + if safe_course_key.run is None: + safe_course_key = safe_course_key.replace(run='only') + + set_custom_attribute('course_id', safe_course_key) + set_custom_attribute('org', loc.org) + set_custom_attribute('contentserver.path', loc.path) + + # Figure out if this is a CDN using us as the origin. + is_from_cdn = StaticContentServer.is_cdn_request(request) + set_custom_attribute('contentserver.from_cdn', is_from_cdn) + + # Check if this content is locked or not. + locked = self.is_content_locked(content) + set_custom_attribute('contentserver.locked', locked) + + # Check that user has access to the content. + if not self.is_user_authorized(request, content, loc): + return HttpResponseForbidden('Unauthorized') + + # Figure out if the client sent us a conditional request, and let them know + # if this asset has changed since then. + last_modified_at_str = content.last_modified_at.strftime(HTTP_DATE_FORMAT) + if 'HTTP_IF_MODIFIED_SINCE' in request.META: + if_modified_since = request.META['HTTP_IF_MODIFIED_SINCE'] + if if_modified_since == last_modified_at_str: + return HttpResponseNotModified() + + # *** File streaming within a byte range *** + # If a Range is provided, parse Range attribute of the request + # Add Content-Range in the response if Range is structurally correct + # Request -> Range attribute structure: "Range: bytes=first-[last]" + # Response -> Content-Range attribute structure: "Content-Range: bytes first-last/totalLength" + # http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35 + response = None + if request.META.get('HTTP_RANGE'): + # If we have a StaticContent, get a StaticContentStream. Can't manipulate the bytes otherwise. + if isinstance(content, StaticContent): + content = AssetManager.find(loc, as_stream=True) + + header_value = request.META['HTTP_RANGE'] + try: + unit, ranges = parse_range_header(header_value, content.length) + except ValueError as exception: + # If the header field is syntactically invalid it should be ignored. + log.exception( + "%s in Range header: %s for content: %s", + str(exception), header_value, str(loc) + ) + else: + if unit != 'bytes': + # Only accept ranges in bytes + log.warning("Unknown unit in Range header: %s for content: %s", header_value, str(loc)) + elif len(ranges) > 1: + # According to Http/1.1 spec content for multiple ranges should be sent as a multipart message. + # http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.16 + # But we send back the full content. + log.warning( + "More than 1 ranges in Range header: %s for content: %s", header_value, str(loc) + ) + else: + first, last = ranges[0] + + if 0 <= first <= last < content.length: + # If the byte range is satisfiable + response = HttpResponse(content.stream_data_in_range(first, last)) + response['Content-Range'] = 'bytes {first}-{last}/{length}'.format( + first=first, last=last, length=content.length + ) + response['Content-Length'] = str(last - first + 1) + response.status_code = 206 # Partial Content + + set_custom_attribute('contentserver.ranged', True) + else: + log.warning( + "Cannot satisfy ranges in Range header: %s for content: %s", + header_value, str(loc) + ) + return HttpResponse(status=416) # Requested Range Not Satisfiable + + # If Range header is absent or syntactically invalid return a full content response. + if response is None: + response = HttpResponse(content.stream_data()) + response['Content-Length'] = content.length + + set_custom_attribute('contentserver.content_len', content.length) + set_custom_attribute('contentserver.content_type', content.content_type) + + # "Accept-Ranges: bytes" tells the user that only "bytes" ranges are allowed + response['Accept-Ranges'] = 'bytes' + response['Content-Type'] = content.content_type + response['X-Frame-Options'] = 'ALLOW' + + # Set any caching headers, and do any response cleanup needed. Based on how much + # middleware we have in place, there's no easy way to use the built-in Django + # utilities and properly sanitize and modify a response to ensure that it is as + # cacheable as possible, which is why we do it ourselves. + self.set_caching_headers(content, response) + + return response + + def set_caching_headers(self, content, response): + """ + Sets caching headers based on whether or not the asset is locked. + """ + + is_locked = getattr(content, "locked", False) + + # We want to signal to the end user's browser, and to any intermediate proxies/caches, + # whether or not this asset is cacheable. If we have a TTL configured, we inform the + # caller, for unlocked assets, how long they are allowed to cache it. Since locked + # assets should be restricted to enrolled students, we simply send headers that + # indicate there should be no caching whatsoever. + cache_ttl = CourseAssetCacheTtlConfig.get_cache_ttl() + if cache_ttl > 0 and not is_locked: + set_custom_attribute('contentserver.cacheable', True) + + response['Expires'] = StaticContentServer.get_expiration_value(datetime.datetime.utcnow(), cache_ttl) + response['Cache-Control'] = "public, max-age={ttl}, s-maxage={ttl}".format(ttl=cache_ttl) + elif is_locked: + set_custom_attribute('contentserver.cacheable', False) + + response['Cache-Control'] = "private, no-cache, no-store" + + response['Last-Modified'] = content.last_modified_at.strftime(HTTP_DATE_FORMAT) + + # Force the Vary header to only vary responses on Origin, so that XHR and browser requests get cached + # separately and don't screw over one another. i.e. a browser request that doesn't send Origin, and + # caches a version of the response without CORS headers, in turn breaking XHR requests. + force_header_for_response(response, 'Vary', 'Origin') + + @staticmethod + def is_cdn_request(request): + """ + Attempts to determine whether or not the given request is coming from a CDN. + + Currently, this is a static check because edx.org only uses CloudFront, but may + be expanded in the future. + """ + cdn_user_agents = CdnUserAgentsConfig.get_cdn_user_agents() + user_agent = request.META.get('HTTP_USER_AGENT', '') + if user_agent in cdn_user_agents: + # This is a CDN request. + return True + + return False + + @staticmethod + def get_expiration_value(now, cache_ttl): + """Generates an RFC1123 datetime string based on a future offset.""" + expire_dt = now + datetime.timedelta(seconds=cache_ttl) + return expire_dt.strftime(HTTP_DATE_FORMAT) + + def is_content_locked(self, content): + """ + Determines whether or not the given content is locked. + """ + return bool(getattr(content, "locked", False)) + + def is_user_authorized(self, request, content, location): + """ + Determines whether or not the user for this request is authorized to view the given asset. + """ + if not self.is_content_locked(content): + return True + + if not hasattr(request, "user") or not request.user.is_authenticated: + return False + + if not request.user.is_staff: + deprecated = getattr(location, 'deprecated', False) + if deprecated and not CourseEnrollment.is_enrolled_by_partial(request.user, location.course_key): + return False + if not deprecated and not CourseEnrollment.is_enrolled(request.user, location.course_key): + return False + + return True + + def load_asset_from_location(self, location): + """ + Loads an asset based on its location, either retrieving it from a cache + or loading it directly from the contentstore. + """ + + # See if we can load this item from cache. + content = get_cached_content(location) + if content is None: + # Not in cache, so just try and load it from the asset manager. + try: + content = AssetManager.find(location, as_stream=True) + except (ItemNotFoundError, NotFoundError): # lint-amnesty, pylint: disable=try-except-raise + raise + + # Now that we fetched it, let's go ahead and try to cache it. We cap this at 1MB + # because it's the default for memcached and also we don't want to do too much + # buffering in memory when we're serving an actual request. + if content.length is not None and content.length < 1048576: + content = content.copy_to_in_mem() + set_cached_content(content) + + return content + + +IMPL = StaticContentServer() + + +def parse_range_header(header_value, content_length): + """ + Returns the unit and a list of (start, end) tuples of ranges. + + Raises ValueError if header is syntactically invalid or does not contain a range. + + See spec for details: http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35 + """ + + unit = None + ranges = [] + + if '=' in header_value: + unit, byte_ranges_string = header_value.split('=') + + # Parse the byte ranges. + for byte_range_string in byte_ranges_string.split(','): + byte_range_string = byte_range_string.strip() + # Case 0: + if '-' not in byte_range_string: # Invalid syntax of header value. # lint-amnesty, pylint: disable=no-else-raise + raise ValueError('Invalid syntax.') + # Case 1: -500 + elif byte_range_string.startswith('-'): + first = max(0, (content_length + int(byte_range_string))) + last = content_length - 1 + # Case 2: 500- + elif byte_range_string.endswith('-'): + first = int(byte_range_string[0:-1]) + last = content_length - 1 + # Case 3: 500-999 + else: + first, last = byte_range_string.split('-') + first = int(first) + last = min(int(last), content_length - 1) + + ranges.append((first, last)) + + if len(ranges) == 0: + raise ValueError('Invalid syntax') + + return unit, ranges