Merge pull request #2014 from edx/ormsbee/grade_distribution_fix
Fix (re-implement) answer distribution report generation.
This commit is contained in:
@@ -5,6 +5,8 @@ These are notable changes in edx-platform. This is a rolling list of changes,
|
||||
in roughly chronological order, most recent first. Add your entries at or near
|
||||
the top. Include a label indicating the component affected.
|
||||
|
||||
LMS: Fix answer distribution download for small courses. LMS-922, LMS-811
|
||||
|
||||
Blades: Add template for the zooming image in studio. BLD-206.
|
||||
|
||||
Blades: Update behavior of start/end time fields. BLD-506.
|
||||
|
||||
@@ -660,8 +660,8 @@ class OptionResponseXMLFactory(ResponseXMLFactory):
|
||||
|
||||
# Set the "options" attribute
|
||||
# Format: "('first', 'second', 'third')"
|
||||
options_attr_string = ",".join(["'%s'" % str(o) for o in options_list])
|
||||
options_attr_string = "(%s)" % options_attr_string
|
||||
options_attr_string = u",".join([u"'{}'".format(o) for o in options_list])
|
||||
options_attr_string = u"({})".format(options_attr_string)
|
||||
optioninput_element.set('options', options_attr_string)
|
||||
|
||||
# Set the "correct" attribute
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
# Compute grades using real division, with no integer truncation
|
||||
from __future__ import division
|
||||
from collections import defaultdict
|
||||
import json
|
||||
import random
|
||||
import logging
|
||||
|
||||
@@ -17,24 +18,15 @@ from courseware import courses
|
||||
from courseware.model_data import FieldDataCache
|
||||
from xblock.fields import Scope
|
||||
from xmodule import graders
|
||||
from xmodule.capa_module import CapaModule
|
||||
from xmodule.graders import Score
|
||||
from xmodule.modulestore.django import modulestore
|
||||
from xmodule.modulestore.exceptions import ItemNotFoundError
|
||||
from .models import StudentModule
|
||||
from .module_render import get_module, get_module_for_descriptor
|
||||
|
||||
log = logging.getLogger("edx.courseware")
|
||||
|
||||
|
||||
def yield_module_descendents(module):
|
||||
stack = module.get_display_items()
|
||||
stack.reverse()
|
||||
|
||||
while len(stack) > 0:
|
||||
next_module = stack.pop()
|
||||
stack.extend(next_module.get_display_items())
|
||||
yield next_module
|
||||
|
||||
|
||||
def yield_dynamic_descriptor_descendents(descriptor, module_creator):
|
||||
"""
|
||||
This returns all of the descendants of a descriptor. If the descriptor
|
||||
@@ -58,74 +50,101 @@ def yield_dynamic_descriptor_descendents(descriptor, module_creator):
|
||||
yield next_descriptor
|
||||
|
||||
|
||||
def yield_problems(request, course, student):
|
||||
def answer_distributions(course_id):
|
||||
"""
|
||||
Return an iterator over capa_modules that this student has
|
||||
potentially answered. (all that student has answered will definitely be in
|
||||
the list, but there may be others as well).
|
||||
Given a course_id, return answer distributions in the form of a dictionary
|
||||
mapping:
|
||||
|
||||
(problem url_name, problem display_name, problem_id) -> {dict: answer -> count}
|
||||
|
||||
Answer distributions are found by iterating through all StudentModule
|
||||
entries for a given course with type="problem" and a grade that is not null.
|
||||
This means that we only count LoncapaProblems that people have submitted.
|
||||
Other types of items like ORA or sequences will not be collected. Empty
|
||||
Loncapa problem state that gets created from running the progress page is
|
||||
also not counted.
|
||||
|
||||
This method accesses the StudentModule table directly instead of using the
|
||||
CapaModule abstraction. The main reason for this is so that we can generate
|
||||
the report without any side-effects -- we don't have to worry about answer
|
||||
distribution potentially causing re-evaluation of the student answer. This
|
||||
also allows us to use the read-replica database, which reduces risk of bad
|
||||
locking behavior. And quite frankly, it makes this a lot less confusing.
|
||||
|
||||
Also, we're pulling all available records from the database for this course
|
||||
rather than crawling through a student's course-tree -- the latter could
|
||||
potentially cause us trouble with A/B testing. The distribution report may
|
||||
not be aware of problems that are not visible to the user being used to
|
||||
generate the report.
|
||||
|
||||
This method will try to use a read-replica database if one is available.
|
||||
"""
|
||||
grading_context = course.grading_context
|
||||
# dict: { module.module_state_key : (url_name, display_name) }
|
||||
state_keys_to_problem_info = {} # For caching, used by url_and_display_name
|
||||
|
||||
descriptor_locations = (descriptor.location.url() for descriptor in grading_context['all_descriptors'])
|
||||
existing_student_modules = set(StudentModule.objects.filter(
|
||||
module_state_key__in=descriptor_locations
|
||||
).values_list('module_state_key', flat=True))
|
||||
def url_and_display_name(module_state_key):
|
||||
"""
|
||||
For a given module_state_key, return the problem's url and display_name.
|
||||
Handle modulestore access and caching. This method ignores permissions.
|
||||
May throw an ItemNotFoundError if there is no content that corresponds
|
||||
to this module_state_key.
|
||||
"""
|
||||
problem_store = modulestore()
|
||||
if module_state_key not in state_keys_to_problem_info:
|
||||
problems = problem_store.get_items(module_state_key, course_id=course_id, depth=1)
|
||||
if not problems:
|
||||
# Likely means that the problem was deleted from the course
|
||||
# after the student had answered. We log this suspicion where
|
||||
# this exception is caught.
|
||||
raise ItemNotFoundError(
|
||||
"Answer Distribution: Module {} not found for course {}"
|
||||
.format(module_state_key, course_id)
|
||||
)
|
||||
problem = problems[0]
|
||||
problem_info = (problem.url_name, problem.display_name_with_default)
|
||||
state_keys_to_problem_info[module_state_key] = problem_info
|
||||
|
||||
sections_to_list = []
|
||||
for _, sections in grading_context['graded_sections'].iteritems():
|
||||
for section in sections:
|
||||
return state_keys_to_problem_info[module_state_key]
|
||||
|
||||
section_descriptor = section['section_descriptor']
|
||||
|
||||
# If the student hasn't seen a single problem in the section, skip it.
|
||||
for moduledescriptor in section['xmoduledescriptors']:
|
||||
if moduledescriptor.location.url() in existing_student_modules:
|
||||
sections_to_list.append(section_descriptor)
|
||||
break
|
||||
|
||||
field_data_cache = FieldDataCache(sections_to_list, course.id, student)
|
||||
for section_descriptor in sections_to_list:
|
||||
section_module = get_module(student, request,
|
||||
section_descriptor.location, field_data_cache,
|
||||
course.id)
|
||||
if section_module is None:
|
||||
# student doesn't have access to this module, or something else
|
||||
# went wrong.
|
||||
# log.debug("couldn't get module for student {0} for section location {1}"
|
||||
# .format(student.username, section_descriptor.location))
|
||||
# Iterate through all problems submitted for this course in no particular
|
||||
# order, and build up our answer_counts dict that we will eventually return
|
||||
answer_counts = defaultdict(lambda: defaultdict(int))
|
||||
for module in StudentModule.all_submitted_problems_read_only(course_id):
|
||||
try:
|
||||
state_dict = json.loads(module.state) if module.state else {}
|
||||
raw_answers = state_dict.get("student_answers", {})
|
||||
except ValueError:
|
||||
log.error(
|
||||
"Answer Distribution: Could not parse module state for " +
|
||||
"StudentModule id={}, course={}".format(module.id, course_id)
|
||||
)
|
||||
continue
|
||||
|
||||
for problem in yield_module_descendents(section_module):
|
||||
if isinstance(problem, CapaModule):
|
||||
yield problem
|
||||
# Each problem part has an ID that is derived from the
|
||||
# module.module_state_key (with some suffix appended)
|
||||
for problem_part_id, raw_answer in raw_answers.items():
|
||||
# Convert whatever raw answers we have (numbers, unicode, None, etc.)
|
||||
# to be unicode values. Note that if we get a string, it's always
|
||||
# unicode and not str -- state comes from the json decoder, and that
|
||||
# always returns unicode for strings.
|
||||
answer = unicode(raw_answer)
|
||||
|
||||
try:
|
||||
url, display_name = url_and_display_name(module.module_state_key)
|
||||
except ItemNotFoundError:
|
||||
msg = "Answer Distribution: Item {} referenced in StudentModule {} " + \
|
||||
"for user {} in course {} not found; " + \
|
||||
"This can happen if a student answered a question that " + \
|
||||
"was later deleted from the course. This answer will be " + \
|
||||
"omitted from the answer distribution CSV."
|
||||
log.warning(
|
||||
msg.format(module.module_state_key, module.id, module.student_id, course_id)
|
||||
)
|
||||
continue
|
||||
|
||||
def answer_distributions(request, course):
|
||||
"""
|
||||
Given a course_descriptor, compute frequencies of answers for each problem:
|
||||
|
||||
Format is:
|
||||
|
||||
dict: (problem url_name, problem display_name, problem_id) -> (dict : answer -> count)
|
||||
|
||||
TODO (vshnayder): this is currently doing a full linear pass through all
|
||||
students and all problems. This will be just a little slow.
|
||||
"""
|
||||
|
||||
counts = defaultdict(lambda: defaultdict(int))
|
||||
|
||||
enrolled_students = User.objects.filter(courseenrollment__course_id=course.id)
|
||||
|
||||
for student in enrolled_students:
|
||||
for capa_module in yield_problems(request, course, student):
|
||||
for problem_id in capa_module.lcp.student_answers:
|
||||
# Answer can be a list or some other unhashable element. Convert to string.
|
||||
answer = str(capa_module.lcp.student_answers[problem_id])
|
||||
key = (capa_module.url_name, capa_module.display_name_with_default, problem_id)
|
||||
counts[key][answer] += 1
|
||||
|
||||
return counts
|
||||
answer_counts[(url, display_name, problem_part_id)][answer] += 1
|
||||
|
||||
return answer_counts
|
||||
|
||||
@transaction.commit_manually
|
||||
def grade(student, request, course, keep_raw_scores=False):
|
||||
|
||||
@@ -13,6 +13,7 @@ ASSUMPTIONS: modules have unique IDs, even across different module_types
|
||||
|
||||
"""
|
||||
from django.contrib.auth.models import User
|
||||
from django.conf import settings
|
||||
from django.db import models
|
||||
from django.db.models.signals import post_save
|
||||
from django.dispatch import receiver
|
||||
@@ -57,6 +58,23 @@ class StudentModule(models.Model):
|
||||
created = models.DateTimeField(auto_now_add=True, db_index=True)
|
||||
modified = models.DateTimeField(auto_now=True, db_index=True)
|
||||
|
||||
@classmethod
|
||||
def all_submitted_problems_read_only(cls, course_id):
|
||||
"""
|
||||
Return all model instances that correspond to problems that have been
|
||||
submitted for a given course. So module_type='problem' and a non-null
|
||||
grade. Use a read replica if one exists for this environment.
|
||||
"""
|
||||
queryset = cls.objects.filter(
|
||||
course_id=course_id,
|
||||
module_type='problem',
|
||||
grade__isnull=False
|
||||
)
|
||||
if "read_replica" in settings.DATABASES:
|
||||
return queryset.using("read_replica")
|
||||
else:
|
||||
return queryset
|
||||
|
||||
def __repr__(self):
|
||||
return 'StudentModule<%r>' % ({
|
||||
'course_id': self.course_id,
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
"""Integration tests for submitting problem responses and getting grades."""
|
||||
|
||||
# text processing dependancies
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Integration tests for submitting problem responses and getting grades.
|
||||
"""
|
||||
# text processing dependencies
|
||||
import json
|
||||
import os
|
||||
from textwrap import dedent
|
||||
@@ -16,6 +18,7 @@ from django.test.utils import override_settings
|
||||
# Need access to internal func to put users in the right group
|
||||
from courseware import grades
|
||||
from courseware.model_data import FieldDataCache
|
||||
from courseware.models import StudentModule
|
||||
|
||||
from xmodule.modulestore.django import modulestore, editable_modulestore
|
||||
|
||||
@@ -29,6 +32,7 @@ from capa.tests.response_xml_factory import (
|
||||
from courseware.tests.helpers import LoginEnrollmentTestCase
|
||||
from courseware.tests.modulestore_config import TEST_DATA_MIXED_MODULESTORE
|
||||
from lms.lib.xblock.runtime import quote_slashes
|
||||
from student.tests.factories import UserFactory
|
||||
|
||||
|
||||
@override_settings(MODULESTORE=TEST_DATA_MIXED_MODULESTORE)
|
||||
@@ -141,7 +145,7 @@ class TestSubmittingProblems(ModuleStoreTestCase, LoginEnrollmentTestCase):
|
||||
question_text='The correct answer is Correct',
|
||||
num_inputs=num_inputs,
|
||||
weight=num_inputs,
|
||||
options=['Correct', 'Incorrect'],
|
||||
options=['Correct', 'Incorrect', u'ⓤⓝⓘⓒⓞⓓⓔ'],
|
||||
correct_option='Correct'
|
||||
)
|
||||
|
||||
@@ -852,3 +856,156 @@ class TestPythonGradedResponse(TestSubmittingProblems):
|
||||
name = 'computed_answer'
|
||||
self.computed_answer_setup(name)
|
||||
self._check_ireset(name)
|
||||
|
||||
|
||||
class TestAnswerDistributions(TestSubmittingProblems):
|
||||
"""Check that we can pull answer distributions for problems."""
|
||||
|
||||
def setUp(self):
|
||||
"""Set up a simple course with four problems."""
|
||||
super(TestAnswerDistributions, self).setUp()
|
||||
|
||||
self.homework = self.add_graded_section_to_course('homework')
|
||||
self.add_dropdown_to_section(self.homework.location, 'p1', 1)
|
||||
self.add_dropdown_to_section(self.homework.location, 'p2', 1)
|
||||
self.add_dropdown_to_section(self.homework.location, 'p3', 1)
|
||||
self.refresh_course()
|
||||
|
||||
def test_empty(self):
|
||||
# Just make sure we can process this without errors.
|
||||
empty_distribution = grades.answer_distributions(self.course.id)
|
||||
self.assertFalse(empty_distribution) # should be empty
|
||||
|
||||
def test_one_student(self):
|
||||
# Basic test to make sure we have simple behavior right for a student
|
||||
|
||||
# Throw in a non-ASCII answer
|
||||
self.submit_question_answer('p1', {'2_1': u'ⓤⓝⓘⓒⓞⓓⓔ'})
|
||||
self.submit_question_answer('p2', {'2_1': 'Correct'})
|
||||
|
||||
distributions = grades.answer_distributions(self.course.id)
|
||||
self.assertEqual(
|
||||
distributions,
|
||||
{
|
||||
('p1', 'p1', 'i4x-MITx-100-problem-p1_2_1'): {
|
||||
u'ⓤⓝⓘⓒⓞⓓⓔ': 1
|
||||
},
|
||||
('p2', 'p2', 'i4x-MITx-100-problem-p2_2_1'): {
|
||||
'Correct': 1
|
||||
}
|
||||
}
|
||||
)
|
||||
|
||||
def test_multiple_students(self):
|
||||
# Our test class is based around making requests for a particular user,
|
||||
# so we're going to cheat by creating another user and copying and
|
||||
# modifying StudentModule entries to make them from other users. It's
|
||||
# a little hacky, but it seemed the simplest way to do this.
|
||||
self.submit_question_answer('p1', {'2_1': u'Correct'})
|
||||
self.submit_question_answer('p2', {'2_1': u'Incorrect'})
|
||||
self.submit_question_answer('p3', {'2_1': u'Correct'})
|
||||
|
||||
# Make the above submissions owned by user2
|
||||
user2 = UserFactory.create()
|
||||
problems = StudentModule.objects.filter(
|
||||
course_id=self.course.id,
|
||||
student_id=self.student_user.id
|
||||
)
|
||||
for problem in problems:
|
||||
problem.student_id = user2.id
|
||||
problem.save()
|
||||
|
||||
# Now make more submissions by our original user
|
||||
self.submit_question_answer('p1', {'2_1': u'Correct'})
|
||||
self.submit_question_answer('p2', {'2_1': u'Correct'})
|
||||
|
||||
self.assertEqual(
|
||||
grades.answer_distributions(self.course.id),
|
||||
{
|
||||
('p1', 'p1', 'i4x-MITx-100-problem-p1_2_1'): {
|
||||
'Correct': 2
|
||||
},
|
||||
('p2', 'p2', 'i4x-MITx-100-problem-p2_2_1'): {
|
||||
'Correct': 1,
|
||||
'Incorrect': 1
|
||||
},
|
||||
('p3', 'p3', 'i4x-MITx-100-problem-p3_2_1'): {
|
||||
'Correct': 1
|
||||
}
|
||||
}
|
||||
)
|
||||
|
||||
def test_other_data_types(self):
|
||||
# We'll submit one problem, and then muck with the student_answers
|
||||
# dict inside its state to try different data types (str, int, float,
|
||||
# none)
|
||||
self.submit_question_answer('p1', {'2_1': u'Correct'})
|
||||
|
||||
# Now fetch the state entry for that problem.
|
||||
student_module = StudentModule.objects.get(
|
||||
course_id=self.course.id,
|
||||
student_id=self.student_user.id
|
||||
)
|
||||
for val in ('Correct', True, False, 0, 0.0, 1, 1.0, None):
|
||||
state = json.loads(student_module.state)
|
||||
state["student_answers"]['i4x-MITx-100-problem-p1_2_1'] = val
|
||||
student_module.state = json.dumps(state)
|
||||
student_module.save()
|
||||
|
||||
self.assertEqual(
|
||||
grades.answer_distributions(self.course.id),
|
||||
{
|
||||
('p1', 'p1', 'i4x-MITx-100-problem-p1_2_1'): {
|
||||
str(val): 1
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
def test_missing_content(self):
|
||||
# If there's a StudentModule entry for content that no longer exists,
|
||||
# we just quietly ignore it (because we can't display a meaningful url
|
||||
# or name for it).
|
||||
self.submit_question_answer('p1', {'2_1': 'Incorrect'})
|
||||
|
||||
# Now fetch the state entry for that problem and alter it so it points
|
||||
# to a non-existent problem.
|
||||
student_module = StudentModule.objects.get(
|
||||
course_id=self.course.id,
|
||||
student_id=self.student_user.id
|
||||
)
|
||||
student_module.module_state_key += "_fake"
|
||||
student_module.save()
|
||||
|
||||
# It should be empty (ignored)
|
||||
empty_distribution = grades.answer_distributions(self.course.id)
|
||||
self.assertFalse(empty_distribution) # should be empty
|
||||
|
||||
def test_broken_state(self):
|
||||
# Missing or broken state for a problem should be skipped without
|
||||
# causing the whole answer_distribution call to explode.
|
||||
|
||||
# Submit p1
|
||||
self.submit_question_answer('p1', {'2_1': u'Correct'})
|
||||
|
||||
# Now fetch the StudentModule entry for p1 so we can corrupt its state
|
||||
prb1 = StudentModule.objects.get(
|
||||
course_id=self.course.id,
|
||||
student_id=self.student_user.id
|
||||
)
|
||||
|
||||
# Submit p2
|
||||
self.submit_question_answer('p2', {'2_1': u'Incorrect'})
|
||||
|
||||
for new_p1_state in ('{"student_answers": {}}', "invalid json!", None):
|
||||
prb1.state = new_p1_state
|
||||
prb1.save()
|
||||
|
||||
# p1 won't show up, but p2 should still work
|
||||
self.assertEqual(
|
||||
grades.answer_distributions(self.course.id),
|
||||
{
|
||||
('p2', 'p2', 'i4x-MITx-100-problem-p2_2_1'): {
|
||||
'Incorrect': 1
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
@@ -140,7 +140,14 @@ def instructor_dashboard(request, course_id):
|
||||
writer.writerow(encoded_row)
|
||||
for datarow in datatable['data']:
|
||||
# 's' here may be an integer, float (eg score) or string (eg student name)
|
||||
encoded_row = [unicode(s).encode('utf-8') for s in datarow]
|
||||
encoded_row = [
|
||||
# If s is already a UTF-8 string, trying to make a unicode
|
||||
# object out of it will fail unless we pass in an encoding to
|
||||
# the constructor. But we can't do that across the board,
|
||||
# because s is often a numeric type. So just do this.
|
||||
s if isinstance(s, str) else unicode(s).encode('utf-8')
|
||||
for s in datarow
|
||||
]
|
||||
writer.writerow(encoded_row)
|
||||
return response
|
||||
|
||||
@@ -1492,14 +1499,16 @@ def get_answers_distribution(request, course_id):
|
||||
"""
|
||||
course = get_course_with_access(request.user, course_id, 'staff')
|
||||
|
||||
dist = grades.answer_distributions(request, course)
|
||||
dist = grades.answer_distributions(course.id)
|
||||
|
||||
d = {}
|
||||
d['header'] = ['url_name', 'display name', 'answer id', 'answer', 'count']
|
||||
|
||||
d['data'] = [[url_name, display_name, answer_id, a, answers[a]]
|
||||
for (url_name, display_name, answer_id), answers in dist.items()
|
||||
for a in answers]
|
||||
d['data'] = [
|
||||
[url_name, display_name, answer_id, a, answers[a]]
|
||||
for (url_name, display_name, answer_id), answers in sorted(dist.items())
|
||||
for a in answers
|
||||
]
|
||||
return d
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user