edx-platform/openedx/features/effort_estimation/block_transformers.py

"""
Effort Estimation Transformer implementation.
Adds effort estimations for block types it recognizes.
"""

import math

import crum
import lxml.html
from django.utils.functional import cached_property
from edxval.api import get_videos_for_course

from openedx.core.djangoapps.content.block_structure.transformer import BlockStructureTransformer
from openedx.core.lib.mobile_utils import is_request_from_mobile_app

from .toggles import EFFORT_ESTIMATION_DISABLED_FLAG


class EffortEstimationTransformer(BlockStructureTransformer):
    """
    A transformer that adds effort estimation to the block tree.

    There are two fields added by this transformer:
    - effort_activities: The number of "activities" at this block or lower. Note that verticals count as a single
                         activity at most. Activities are basically anything that isn't text or video.
    - effort_time: Our best guess at how long the block and lower will take, in seconds. We use an estimated reading
                   speed and video duration to calculate this. Just a rough guide.

    If there is any missing data (like no video duration), we don't provide any estimates at all for the course.
    We'd rather provide no estimate than a misleading estimate.

    This transformer requires data gathered during the collection phase (from a course publish), so it won't work
    on a course until the next publish.
    """
    WRITE_VERSION = 1
    READ_VERSION = 1

    # Public xblock field names
    EFFORT_ACTIVITIES = 'effort_activities'
    EFFORT_TIME = 'effort_time'

    # Private transformer field names
    DISABLE_ESTIMATION = 'disable_estimation'
    HTML_WORD_COUNT = 'html_word_count'
    VIDEO_CLIP_DURATION = 'video_clip_duration'
    VIDEO_DURATION = 'video_duration'

    CACHE_VIDEO_DURATIONS = 'video.durations'
    DEFAULT_WPM = 265  # words per minute

    class MissingEstimationData(Exception):
        pass

    @classmethod
    def name(cls):
        """Unique identifier for the transformer's class; same identifier used in setup.py."""
        return 'effort_estimation'

    @classmethod
    def collect(cls, block_structure):
        """
        Grabs raw estimates for leaf content.

        Pooling leaf estimates higher up the tree (e.g. in verticals, then sequentials, then chapters) is done by
        transform() below at run time, because which blocks each user sees can be different.
        """
        block_structure.request_xblock_fields('category')
        block_structure.request_xblock_fields('global_speed', 'only_on_web')  # video fields

        collection_cache = {}  # collection methods can stuff some temporary data here

        collections = {
            'html': cls._collect_html_effort,
            'video': cls._collect_video_effort,
        }

        try:
            for block_key in block_structure.topological_traversal():
                xblock = block_structure.get_xblock(block_key)

                if xblock.category in collections:
                    collections[xblock.category](block_structure, block_key, xblock, collection_cache)

        except cls.MissingEstimationData:
            # Some bit of required data is missing. Likely some duration info is missing from the video pipeline.
            # Rather than attempt to work around it, just set a note for ourselves to not show durations for this
            # course at all. Better no estimate than a misleading estimate.
            block_structure.set_transformer_data(cls, cls.DISABLE_ESTIMATION, True)

    @classmethod
    def _collect_html_effort(cls, block_structure, block_key, xblock, _cache):
        """Records a word count for later reading speed calculations."""
        try:
            text = lxml.html.fromstring(xblock.data).text_content() if xblock.data else ''
        except Exception as exc:  # pylint: disable=broad-except
            raise cls.MissingEstimationData() from exc

        block_structure.set_transformer_block_field(block_key, cls, cls.HTML_WORD_COUNT, len(text.split()))

    @classmethod
    def _collect_video_effort(cls, block_structure, block_key, xblock, cache):
        """Records a duration for later viewing speed calculations."""
        # Lookup all course video metadata at once rather than piecemeal, for performance reasons
        if cls.CACHE_VIDEO_DURATIONS not in cache:
            all_videos, _ = get_videos_for_course(str(block_structure.root_block_usage_key.course_key))
            cache[cls.CACHE_VIDEO_DURATIONS] = {v['edx_video_id']: v['duration'] for v in all_videos}

        # Check if we have a duration. If not, raise an exception that will stop this transformer from affecting
        # this course.
        duration = cache[cls.CACHE_VIDEO_DURATIONS].get(xblock.edx_video_id, 0)
        if duration <= 0:
            raise cls.MissingEstimationData()

        block_structure.set_transformer_block_field(block_key, cls, cls.VIDEO_DURATION, duration)

        # Some videos will suggest specific start & end times, rather than the whole video. Note that this is only
        # supported in some clients (other clients - like the mobile app - will play the whole video anyway). So we
        # record this duration separately, to use instead of the whole video duration if the client supports it.
        clip_duration = (xblock.end_time - xblock.start_time).total_seconds()
        if clip_duration > 0:
            block_structure.set_transformer_block_field(block_key, cls, cls.VIDEO_CLIP_DURATION, clip_duration)

    def transform(self, usage_info, block_structure):
        # Early exit if our per-course opt-out flag is enabled
        if EFFORT_ESTIMATION_DISABLED_FLAG.is_enabled(block_structure.root_block_usage_key.course_key):
            return

        # Skip any transformation if our collection phase said to
        cls = EffortEstimationTransformer
        if block_structure.get_transformer_data(cls, cls.DISABLE_ESTIMATION, default=False):
            return

        # These estimation methods should return a tuple of (a number in seconds, an activity count)
        estimations = {
            'chapter': self._estimate_children_effort,
            'course': self._estimate_children_effort,
            'html': self._estimate_html_effort,
            'sequential': self._estimate_children_effort,
            'vertical': self._estimate_vertical_effort,
            'video': self._estimate_video_effort,
        }

        # We're good to continue and make user-specific estimates based on collected data
        for block_key in block_structure.post_order_traversal():
            category = block_structure.get_xblock_field(block_key, 'category')
            if category not in estimations:
                continue

            time, activities = estimations[category](usage_info, block_structure, block_key)

            if time is not None:
                # We take the ceiling of the estimate here just for cleanliness. Losing the fractional seconds does
                # technically make our estimate less accurate, especially as we combine these values in parents.
                # But easier to present a simple integer to any consumers, and precise to-the-second accuracy on our
                # estimate is not a primary goal.
                block_structure.override_xblock_field(block_key, self.EFFORT_TIME, math.ceil(time))

            if activities is not None:
                block_structure.override_xblock_field(block_key, self.EFFORT_ACTIVITIES, activities)

    @cached_property
    def _is_on_mobile(self):
        """Returns whether the current request is from our mobile app."""
        request = crum.get_current_request()
        return request and is_request_from_mobile_app(request)

    def _gather_child_values(self, block_structure, block_key, field, default=0):
        """Collects and sums all child values for field."""
        return sum(
            block_structure.get_xblock_field(child_key, field, default=default)
            for child_key in block_structure.get_children(block_key)
        )

    def _estimate_children_effort(self, _usage_info, block_structure, block_key):
        """Collects time and activity counts for children."""
        time = self._gather_child_values(block_structure, block_key, self.EFFORT_TIME)
        time = time or None  # avoid claiming anything takes 0 seconds by coercing to None (no estimate) instead

        # Use 1 as the default for activity - any block that we don't know for sure is 0, we should count
        activities = self._gather_child_values(block_structure, block_key, self.EFFORT_ACTIVITIES, default=1)

        return time, activities

    def _estimate_html_effort(self, _usage_info, block_structure, block_key):
        """Returns an average expected time to read the contained html."""
        cls = EffortEstimationTransformer
        word_count = block_structure.get_transformer_block_field(block_key, cls, self.HTML_WORD_COUNT)
        if not word_count:
            return None, 0

        time = word_count / self.DEFAULT_WPM * 60  # in seconds
        return time, 0

    def _estimate_vertical_effort(self, usage_info, block_structure, block_key):
        """A vertical is either an amount of time if we know it, or an activity"""
        time, activities = self._estimate_children_effort(usage_info, block_structure, block_key)

        # Verticals are the basic activity metric - we may have collected all unknown xblocks as activities in the call
        # above, but we reset that count to 1 here.
        return time, 1 if activities else 0

    def _estimate_video_effort(self, _usage_info, block_structure, block_key):
        """Returns an expected time to view the video, at the user's preferred speed."""
        cls = EffortEstimationTransformer
        clip_duration = block_structure.get_transformer_block_field(block_key, cls, self.VIDEO_CLIP_DURATION)
        duration = block_structure.get_transformer_block_field(block_key, cls, self.VIDEO_DURATION)
        global_speed = block_structure.get_xblock_field(block_key, 'global_speed', default=1)
        only_on_web = block_structure.get_xblock_field(block_key, 'only_on_web', default=False)

        if self._is_on_mobile:
            if only_on_web:
                return None, 0
            clip_duration = None  # mobile can't do clips

        user_duration = clip_duration or duration
        if not user_duration:
            return None, 0

        # We are intentionally only looking at global_speed, not speed (which is last speed user used on this video)
        # because this estimate is meant to be somewhat static.
        return user_duration / global_speed, 0