# edx-platform/openedx/core/lib/request_utils.py

""" Utility functions related to HTTP requests """

import logging
import random
import re
from urllib.parse import urlparse

import crum
from django.conf import settings
from django.test.client import RequestFactory
from edx_django_utils.cache import RequestCache
from edx_django_utils.monitoring import set_custom_attribute
from opaque_keys import InvalidKeyError
from opaque_keys.edx.keys import CourseKey
from rest_framework.views import exception_handler

from openedx.core.djangoapps.site_configuration import helpers as configuration_helpers

# Pattern used to pull a course id out of a URL path.
#   - ``^(.*?/course(s)?/)`` lazily matches a prefix ending in ``/course/`` or ``/courses/``.
#   - ``(?!v[0-9]+/[^/]+)`` rejects a version segment followed by another path segment right after that prefix.
#     This accommodates course api urls, excluding any course api routes that do not fall under v*/courses,
#     such as v1/blocks.
#   - ``settings.COURSE_ID_PATTERN`` is interpolated once, when this module is imported. It is presumably what
#     defines the ``course_id`` named group read by ``course_id_from_url`` -- verify against the setting.
COURSE_REGEX = re.compile(fr'^(.*?/course(s)?/)(?!v[0-9]+/[^/]+){settings.COURSE_ID_PATTERN}')

# Module-level logger shared by the helpers and middleware in this file.
log = logging.getLogger(__name__)


def get_request_or_stub():
    """
    Return the request currently being processed, or a stand-in request when there is none.

    When no request is active (for example, in tests, celery tasks, and XBlocks), a fake
    GET request for "/" is built from ``settings.SITE_NAME`` so that callers can still
    build absolute URIs from it.
    """
    current_request = crum.get_current_request()
    if current_request is not None:
        return current_request

    # SITE_NAME may carry a port number, so run it through a URL parser rather than
    # using it verbatim as the server name.
    site_url = urlparse(f"http://{settings.SITE_NAME}")

    # The stub is only good for constructing absolute URIs to other paths.
    stub_factory = RequestFactory(
        SERVER_NAME=site_url.hostname,
        SERVER_PORT=site_url.port or 80,
    )
    return stub_factory.get("/")


def safe_get_host(request):
    """
    Return the host name for this request without trusting unvalidated input.

    ``request.get_host()`` is only used when ALLOWED_HOSTS is a list or tuple that
    does not contain the wildcard '*'. In every other case the value comes from the
    site configuration's 'site_domain', falling back to ``settings.SITE_NAME``.

    This ensures we never accept an untrusted value of get_host().
    """
    allowed_hosts = settings.ALLOWED_HOSTS
    hosts_are_restricted = isinstance(allowed_hosts, (list, tuple)) and '*' not in allowed_hosts

    if not hosts_are_restricted:
        return configuration_helpers.get_value('site_domain', settings.SITE_NAME)

    return request.get_host()


def course_id_from_url(url):
    """
    Extract the course key embedded in the given `url`.

    Returns:
        The parsed CourseKey, or None when `url` is empty, does not match
        COURSE_REGEX, has no 'course_id' group value, or the matched text is not
        a valid course key.
    """
    if not url:
        return None

    url_match = COURSE_REGEX.match(url)
    raw_course_id = url_match.group('course_id') if url_match is not None else None
    if raw_course_id is None:
        return None

    try:
        course_key = CourseKey.from_string(raw_course_id)
    except InvalidKeyError:
        course_key = None
    return course_key


class CookieMonitoringMiddleware:
    """
    Middleware for monitoring the size and growth of all our cookies, to see if
    we're running into browser limits.
    """

    def __init__(self, get_response):
        self.get_response = get_response

    def __call__(self, request):
        """
        Produce the response, then record cookie monitoring data for the request.

        Monitoring is best-effort: ordinary errors raised while monitoring are logged
        and never affect the response.
        """
        response = self.get_response(request)
        # monitor after response to ensure logging can include user id where possible.
        try:
            self.log_and_monitor_cookies(request)
        except Exception:  # pylint: disable=broad-except
            # Deliberately not BaseException: KeyboardInterrupt, SystemExit and GeneratorExit
            # must keep propagating rather than being swallowed by monitoring code.
            log.exception("Unexpected error logging and monitoring cookies.")
        return response

    def log_and_monitor_cookies(self, request):
        """
        Add logging and custom attributes for monitoring cookie sizes.

        Don't log contents of cookies because that might cause a security issue.
        We just want to see if any cookies are growing out of control.

        Useful NRQL Queries:

            # Always available
            SELECT * FROM Transaction WHERE cookies.header.size > 6000

        Attributes that are added by this middleware:

            For all requests:

                cookies.header.size: The total size in bytes of the cookie header

            If COOKIE_HEADER_SIZE_LOGGING_THRESHOLD is reached, or the request is
            picked by random sampling (see COOKIE_SAMPLING_REQUEST_COUNT):

                cookies.header.size.computed

        Related Settings (see annotations for details):

            - COOKIE_HEADER_SIZE_LOGGING_THRESHOLD
            - COOKIE_SAMPLING_REQUEST_COUNT

        """

        raw_header_cookie = request.META.get('HTTP_COOKIE', '')
        cookie_header_size = len(raw_header_cookie.encode('utf-8'))
        # .. custom_attribute_name: cookies.header.size
        # .. custom_attribute_description: The total size in bytes of the cookie header.
        set_custom_attribute('cookies.header.size', cookie_header_size)

        # .. setting_name: COOKIE_HEADER_SIZE_LOGGING_THRESHOLD
        # .. setting_default: None
        # .. setting_description: The minimum size for the full cookie header to log a list of cookie names and sizes.
        #   Should be set to a relatively high threshold (suggested 9-10K) to avoid flooding the logs.
        logging_threshold = getattr(settings, "COOKIE_HEADER_SIZE_LOGGING_THRESHOLD", None)

        # Without a threshold, neither large-header logging nor sampling is enabled.
        if not logging_threshold:
            return

        is_large_cookie_header_detected = cookie_header_size >= logging_threshold
        if not is_large_cookie_header_detected:
            # .. setting_name: COOKIE_SAMPLING_REQUEST_COUNT
            # .. setting_default: None
            # .. setting_description: This setting enables sampling cookie header logging for cookie headers smaller
            #   than COOKIE_HEADER_SIZE_LOGGING_THRESHOLD. The cookie header logging will happen randomly for each
            #   request with a chance of 1 in COOKIE_SAMPLING_REQUEST_COUNT. For example, to see approximately one
            #   sampled log message every 10 minutes, set COOKIE_SAMPLING_REQUEST_COUNT to the average number of
            #   requests in 10 minutes.
            # .. setting_warning: This setting requires COOKIE_HEADER_SIZE_LOGGING_THRESHOLD to be enabled to take
            #   effect.
            sampling_request_count = getattr(settings, "COOKIE_SAMPLING_REQUEST_COUNT", None)

            # if the cookie header size is lower than the threshold, skip logging unless configured to do
            #   random sampling and we choose the lucky number (in this case, 1).
            if not sampling_request_count or random.randint(1, sampling_request_count) > 1:
                return

        # Sort starting with largest cookies. Only names and sizes are logged, never values.
        sorted_cookie_items = sorted(request.COOKIES.items(), key=lambda x: len(x[1]), reverse=True)
        sizes = ', '.join(f"{name}: {len(value)}" for (name, value) in sorted_cookie_items)
        if is_large_cookie_header_detected:
            log_prefix = f"Large (>= {logging_threshold}) cookie header detected."
        else:
            log_prefix = f"Sampled small (< {logging_threshold}) cookie header."
        log.info(f"{log_prefix} BEGIN-COOKIE-SIZES(total={cookie_header_size}) {sizes} END-COOKIE-SIZES")
        # The computed header size can be used to double check that there aren't large cookies that are
        #   duplicates in the original header (from different domains) that aren't being accounted for.
        # Each cookie contributes "name=value; " (3 extra characters); the final cookie has no trailing
        #   "; ", hence the -2. max() keeps the result at 0 when there are no cookies.
        cookies_header_size_computed = max(
            0, sum(len(name) + len(value) + 3 for (name, value) in request.COOKIES.items()) - 2
        )

        # .. custom_attribute_name: cookies.header.size.computed
        # .. custom_attribute_description: The computed total size of the cookie header, based on the
        #   cookies found in request.COOKIES. This value will only be captured when cookie sizes are logged,
        #   that is for cookie headers of at least COOKIE_HEADER_SIZE_LOGGING_THRESHOLD or for sampled smaller
        #   headers. The value can be used to double check that there aren't large cookies that are
        #   duplicates in the cookie header (from different domains) that aren't being accounted for.
        set_custom_attribute('cookies.header.size.computed', cookies_header_size_computed)


def expected_error_exception_handler(exc, context):
    """
    Drop-in replacement for DRF's default exception handler that also observes expected errors.

    The standard DRF error response is produced first and returned unchanged. In
    addition, the exception is passed on for expected-error logging and monitoring,
    together with the request found in ``context`` (or None when there is none).
    """
    # Let REST framework's default handler build the standard error response.
    drf_response = exception_handler(exc, context)

    current_request = None
    try:
        if 'request' in context:
            current_request = context['request']
    except TypeError:
        # context does not support membership tests or item lookup; carry on without a request.
        pass

    _log_and_monitor_expected_errors(current_request, exc, 'drf')
    return drf_response


class ExpectedErrorMiddleware:
    """
    Middleware that hooks uncaught exceptions into expected-error logging and monitoring.
    """
    def __init__(self, get_response):
        self.get_response = get_response

    def __call__(self, request):
        # Requests pass straight through; the work happens in process_exception.
        return self.get_response(request)

    def process_exception(self, request, exception):
        """
        Record logging and monitoring data for an exception raised while handling `request`.
        """
        _log_and_monitor_expected_errors(request, exception, 'middleware')


# .. setting_name: EXPECTED_ERRORS
# .. setting_default: None
# .. setting_description: Used to configure logging and monitoring for expected errors.
#     This setting is configured as a list of dicts. See setting and toggle annotations for
#     ``EXPECTED_ERRORS[N]['XXX']`` for details of each item in the dict.
#     If this setting is a non-empty list, all uncaught errors processed will get a ``checked_error_expected_from``
#     attribute, whether they are expected or not. Those errors that are processed and match a 'MODULE_AND_CLASS'
#     (documented elsewhere), will get an ``error_expected`` custom attribute. Unexpected errors would be errors with
#     ``error_expected IS NULL``. For additional diagnostic information for ignored errors, see the
#     EXPECTED_ERRORS[N]['IS_IGNORED'] annotation.
# .. setting_warning: We use Django Middleware and a DRF custom error handler to find uncaught errors. Some errors may
#     slip through the cracks, like ValidationError. Any error where ``checked_error_expected_from IS NULL`` is
#     an error that was not processed.

# .. setting_name: EXPECTED_ERRORS[N]['MODULE_AND_CLASS']
# .. setting_default: None
# .. setting_description: Required error module and class name that is expected. For example,
#     ``rest_framework.exceptions.PermissionDenied``.

# .. toggle_name: EXPECTED_ERRORS[N]['IS_IGNORED']
# .. toggle_implementation: DjangoSetting
# .. toggle_default: True
# .. toggle_description: Set this to False if the errors are not ignored by monitoring, but only expected, like
#      for temporary problems that may take some time to fix. If True, adds the custom attributes
#      ``error_ignored_class`` and ``error_ignored_message`` to help diagnose issues with ignored errors, since
#      this data is not otherwise available. For example of ignoring errors in New Relic, see:
#      https://docs.newrelic.com/docs/agents/manage-apm-agents/agent-data/manage-errors-apm-collect-ignore-or-mark-expected/#ignore  pylint: disable=line-too-long,useless-suppression
#      To query for ignored errors, you would use ``error_ignored_class IS NOT NULL``.
#      Note: This is defaulted to True because it will be easier for us to detect if True is not the correct value, by
#      seeing that these errors aren't actually ignored.
# .. toggle_warning: At this time, this toggle does not actually configure the error to be ignored. It is meant to match
#     the ignored error configuration found elsewhere. When monitoring, no errors should ever have the attribute
#     ``error_ignored_class``. Only Transactions should have this custom attribute. If found for an error, it means we
#     are stating an error should be ignored when it is not actually configured as such, or the configuration is not
#     working.
# .. toggle_use_cases: opt_out
# .. toggle_creation_date: 2021-03-11

# .. toggle_name: EXPECTED_ERRORS[N]['LOG_ERROR']
# .. toggle_implementation: DjangoSetting
# .. toggle_default: False
# .. toggle_description: If True, the error will be logged with a message like: "Expected error ...".
# .. toggle_use_cases: opt_in
# .. toggle_creation_date: 2021-03-11

# .. toggle_name: EXPECTED_ERRORS[N]['LOG_STACK_TRACE']
# .. toggle_implementation: DjangoSetting
# .. toggle_default: False
# .. toggle_description: If True, the stacktrace will be included with the logging message.
# .. toggle_warning: Requires ``LOG_ERROR`` to be set to True, otherwise this value will be ignored.
# .. toggle_use_cases: opt_in
# .. toggle_creation_date: 2021-03-11

# .. setting_name: EXPECTED_ERRORS[N]['REASON_EXPECTED']
# .. setting_default: None
# .. setting_description: Required string explaining why the error is expected and/or ignored for documentation
#     purposes.


# Warning: do not access this directly, but instead use _get_expected_error_settings_dict.
# The EXPECTED_ERRORS Django setting is processed and cached here as a dict keyed by the 'module.Class' name of
# each expected error. None means the setting has not been processed yet (or the cache was cleared).
_EXPECTED_ERROR_SETTINGS_DICT = None


def _get_expected_error_settings_dict():
    """
    Returns a dict of dicts of expected error settings used for logging and monitoring.

    The contents of the EXPECTED_ERRORS Django Setting list is processed for efficient lookup by module.Class.
    The processed result is cached at module level, so the setting is normally only processed once per process.

    Entries are validated while processing:

    * An entry whose 'MODULE_AND_CLASS' is not a string is skipped with an error log.
    * A ':' in 'MODULE_AND_CLASS' is replaced with '.' with a warning log.
    * An entry without a 'REASON_EXPECTED' is skipped with an error log.
    * A valid entry that repeats an earlier 'MODULE_AND_CLASS' replaces it, with a warning log.

    Any unexpected error while processing is logged and recorded in the
    ``expected_errors_setting_misconfigured`` custom attribute; the entries processed so far are kept.

    Returns:
         (dict): dict of dicts, mapping module-and-class name to settings for proper handling of expected errors.
           Keys of the inner dicts use the lowercase version of the related Django Setting (e.g. 'REASON_EXPECTED' =>
           'reason_expected').

    Example return value::

        {
            'rest_framework.exceptions.PermissionDenied': {
                'is_ignored': True,
                'log_error': True,
                'log_stack_trace': True,
                'reason_expected': 'In most cases, signifies a user was trying to do something they cannot do. '
                   'However, an overabundance could indicate a bug, which could be monitored for.'
            }
            ...
        }

    """
    global _EXPECTED_ERROR_SETTINGS_DICT

    # Return cached processed mappings if already processed
    if _EXPECTED_ERROR_SETTINGS_DICT is not None:
        return _EXPECTED_ERROR_SETTINGS_DICT

    expected_errors = getattr(settings, 'EXPECTED_ERRORS', None)
    if expected_errors is None:
        _EXPECTED_ERROR_SETTINGS_DICT = {}
        return _EXPECTED_ERROR_SETTINGS_DICT

    # Use temporary variable to build mappings to avoid multi-threading issue with a partially
    # processed map.  Worst case, it is processed more than once at start-up.
    expected_error_settings_dict = {}

    try:
        for index, expected_error in enumerate(expected_errors):
            module_and_class = expected_error.get('MODULE_AND_CLASS')
            processed_expected_error = {
                'is_ignored': expected_error.get('IS_IGNORED', True),
                'log_error': expected_error.get('LOG_ERROR', False),
                'log_stack_trace': expected_error.get('LOG_STACK_TRACE', False),
                'reason_expected': expected_error.get('REASON_EXPECTED'),
            }

            # validate configuration
            if not isinstance(module_and_class, str):
                log.error(
                    "Skipping EXPECTED_ERRORS[%d] setting. 'MODULE_AND_CLASS' set to [%s] and should be module.Class, "
                    "like 'rest_framework.exceptions.PermissionDenied'.",
                    index, module_and_class
                )
                continue
            if ':' in module_and_class:
                log.warning(
                    "Replacing ':' with '.' in EXPECTED_ERRORS[%d]['MODULE_AND_CLASS'], which was set to %s. Note that "
                    "monitoring and logging will not include the ':'.",
                    index, module_and_class
                )
                module_and_class = module_and_class.replace(":", ".")
            if not processed_expected_error['reason_expected']:
                log.error(
                    "Skipping EXPECTED_ERRORS[%d] setting. 'REASON_EXPECTED' is required to document why %s is an "
                    "expected error.",
                    index, module_and_class
                )
                continue
            # Only report an override once the entry is known to be valid, so the warning is never
            # emitted for an entry that ends up being skipped.
            if module_and_class in expected_error_settings_dict:
                log.warning(
                    "EXPECTED_ERRORS[%d] setting is overriding an earlier setting. 'MODULE_AND_CLASS' [%s] is defined "
                    "multiple times.",
                    index, module_and_class
                )
            expected_error_settings_dict[module_and_class] = processed_expected_error
    except Exception as e:  # pylint: disable=broad-except
        # A misconfigured setting must never break request handling; record it and keep what was processed.
        set_custom_attribute('expected_errors_setting_misconfigured', repr(e))
        log.exception(f'Error processing setting EXPECTED_ERRORS. {repr(e)}')

    _EXPECTED_ERROR_SETTINGS_DICT = expected_error_settings_dict
    return _EXPECTED_ERROR_SETTINGS_DICT


def clear_cached_expected_error_settings():
    """
    Clears the cached expected error settings. Useful for testing.

    Resets the module-level cache to None, so the next call to _get_expected_error_settings_dict
    reads and processes the EXPECTED_ERRORS Django setting again.
    """
    global _EXPECTED_ERROR_SETTINGS_DICT
    _EXPECTED_ERROR_SETTINGS_DICT = None


def _log_and_monitor_expected_errors(request, exception, caller):
    """
    Add logging and custom monitoring attributes for an exception, if it is an expected error.

    Nothing happens when no expected errors are configured. Otherwise the caller is
    recorded in ``checked_error_expected_from``, and only the first exception seen
    during a request is processed further; later calls in the same request return early.

    Arguments:
        request: The request
        exception: The exception
        caller: Either 'middleware' or 'drf'
    """
    expected_errors_by_name = _get_expected_error_settings_dict()
    if not expected_errors_by_name:
        return

    # Build 'module.Class', for example, 'django.core.exceptions.PermissionDenied'.
    # The separator is left out when the exception reports no module.
    module_name = getattr(exception, '__module__', '')
    class_name = exception.__class__.__name__
    module_and_class = f"{module_name}{'.' if module_name else ''}{class_name}"

    # Record which caller checked the error, to potentially help find issues where errors are never processed.
    set_custom_attribute('checked_error_expected_from', caller)

    # Find out whether another caller already added logging/monitoring during this request.
    handled_cache = RequestCache('openedx.core.lib.request_utils')
    previously_handled = handled_cache.get_cached_response('handled_exception')
    if previously_handled.is_found:
        previous_module_and_class = previously_handled.value
        if previous_module_and_class == module_and_class:
            # The same exception was already processed by a different caller.
            set_custom_attribute('checked_error_expected_from', 'multiple')
        else:
            # We have confirmed using monitoring that it is very rare that middleware and drf handle different
            # uncaught exceptions. We will leave this attribute in place, but it is not worth investing in a
            # workaround, especially given that New Relic now offers its own expected error functionality, and
            # this functionality may be simplified or removed.
            set_custom_attribute('unexpected_multiple_exceptions', previous_module_and_class)
            log.warning(
                "Unexpected scenario where different exceptions are handled by _log_and_monitor_expected_errors. "
                "See 'unexpected_multiple_exceptions' custom attribute. Skipping exception for %s.",
                module_and_class,
            )
        return
    handled_cache.set('handled_exception', module_and_class)

    matched_settings = expected_errors_by_name.get(module_and_class)
    if matched_settings is None:
        # Not configured as an expected error.
        return

    error_message = str(exception)
    set_custom_attribute('error_expected', True)

    if matched_settings['is_ignored']:
        # Ignored errors are dropped by the monitoring system, so their details
        # have to be captured here to remain available.
        set_custom_attribute('error_ignored_class', module_and_class)
        set_custom_attribute('error_ignored_message', error_message)

    if not matched_settings['log_error']:
        return

    log.info(
        'Expected error %s: %s: seen for path %s',
        module_and_class,
        error_message,
        getattr(request, 'path', 'request-path-unknown'),
        exc_info=exception if matched_settings['log_stack_trace'] else None,
    )