Incorporate changes in max_retry logic, adding subtask_status as bulk_email arg.

2013-09-26 17:38:01 -04:00
parent 08a08448ee
commit 7b7afd472d
4 changed files with 185 additions and 84 deletions
--- a/lms/djangoapps/bulk_email/tasks.py
+++ b/lms/djangoapps/bulk_email/tasks.py
@@ -36,8 +36,8 @@ from courseware.courses import get_course_by_id, course_image_url
 from instructor_task.models import InstructorTask
 from instructor_task.subtasks import (
    update_subtask_status,
-    create_subtask_result,
-    increment_subtask_result,
+    create_subtask_status,
+    increment_subtask_status,
    update_instructor_task_for_subtasks,
 )

@@ -54,7 +54,7 @@ QUOTA_EXCEEDED_ERRORS = (SESDailyQuotaExceededError, )
 # Errors that mail is being sent too quickly. When caught by a task, it
 # triggers an exponential backoff and retry. Retries happen continuously until
 # the email is sent.
-SENDING_RATE_ERRORS = (SESMaxSendingRateExceededError, )
+INFINITE_RETRY_ERRORS = (SESMaxSendingRateExceededError, )


 def _get_recipient_queryset(user_id, to_option, course_id, course_location):
@@ -120,6 +120,7 @@ def perform_delegate_email_batches(entry_id, course_id, task_input, action_name)
    # get inputs to use in this task from the entry:
    #task_id = entry.task_id
    user_id = entry.requester.id
+    task_id = entry.task_id

    # TODO: check this against argument passed in?
    # course_id = entry.course_id
@@ -132,7 +133,7 @@ def perform_delegate_email_batches(entry_id, course_id, task_input, action_name)
        # is submitted and reaches this point.  It is possible to add retry behavior here,
        # to keep trying until the object is actually committed by the view function's return,
        # but it's cleaner to just expect to be done.
-        log.warning("Failed to get CourseEmail with id %s", email_id)
+        log.warning("Task %s: Failed to get CourseEmail with id %s", task_id, email_id)
        raise

    to_option = email_obj.to_option
@@ -144,26 +145,25 @@ def perform_delegate_email_batches(entry_id, course_id, task_input, action_name)
    try:
        course = get_course_by_id(course_id, depth=1)
    except Http404 as exc:
-        log.exception("get_course_by_id failed: %s", exc.args[0])
+        log.exception("Task %s: get_course_by_id failed: %s", task_id, exc.args[0])
        raise Exception("get_course_by_id failed: " + exc.args[0])

    global_email_context = _get_course_email_context(course)
    recipient_qset = _get_recipient_queryset(user_id, to_option, course_id, course.location)
    total_num_emails = recipient_qset.count()

-    log.info("Preparing to queue emails to %d recipient(s) for course %s, email %s, to_option %s",
-             total_num_emails, course_id, email_id, to_option)
+    log.info("Task %s: Preparing to queue emails to %d recipient(s) for course %s, email %s, to_option %s",
+             task_id, total_num_emails, course_id, email_id, to_option)

    num_queries = int(math.ceil(float(total_num_emails) / float(settings.EMAILS_PER_QUERY)))
    last_pk = recipient_qset[0].pk - 1
-    num_workers = 0
+    num_emails_queued = 0
    task_list = []
    subtask_id_list = []
    for _ in range(num_queries):
        # Note that if we were doing this for regrading we probably only need 'pk', and not
        # either profile__name or email.  That's because we'll have to do
        # a lot more work in the individual regrade for each user, but using user_id as a key.
-        # TODO: figure out how to pass these values as an argument, when refactoring this code.
        recipient_sublist = list(recipient_qset.order_by('pk').filter(pk__gt=last_pk)
                                 .values('profile__name', 'email', 'pk')[:settings.EMAILS_PER_QUERY])
        last_pk = recipient_sublist[-1]['pk']
@@ -179,22 +179,32 @@ def perform_delegate_email_batches(entry_id, course_id, task_input, action_name)
                to_list = recipient_sublist[i * chunk:i * chunk + chunk]
            subtask_id = str(uuid4())
            subtask_id_list.append(subtask_id)
+            retry_progress = create_subtask_status()
            task_list.append(send_course_email.subtask((
                entry_id,
                email_id,
                to_list,
                global_email_context,
+                retry_progress,
            ),
            task_id=subtask_id,
            routing_key=settings.HIGH_PRIORITY_QUEUE,
            ))
-        num_workers += num_tasks_this_query
+        num_emails_queued += num_emails_this_query
+
+    # Sanity check: we expect the chunking to be properly summing to the original count:
+    if num_emails_queued != total_num_emails:
+        error_msg = "Task {}: number of emails generated by chunking {} not equal to original total {}".format(
+            task_id, num_emails_queued, total_num_emails
+        )
+        log.error(error_msg)
+        raise Exception(error_msg)

    # Update the InstructorTask  with information about the subtasks we've defined.
    progress = update_instructor_task_for_subtasks(entry, action_name, total_num_emails, subtask_id_list)
    num_subtasks = len(subtask_id_list)
-    log.info("Preparing to queue %d email tasks for course %s, email %s, to %s",
-             num_subtasks, course_id, email_id, to_option)
+    log.info("Preparing to queue %d email tasks (%d emails) for course %s, email %s, to %s",
+             num_subtasks, total_num_emails, course_id, email_id, to_option)

    # now group the subtasks, and start them running:
    task_group = group(task_list)
@@ -202,9 +212,9 @@ def perform_delegate_email_batches(entry_id, course_id, task_input, action_name)

    # We want to return progress here, as this is what will be stored in the
    # AsyncResult for the parent task as its return value.
-    # The AsyncResult will then be marked as SUCCEEDED, and have this return value as it's "result".
+    # The AsyncResult will then be marked as SUCCEEDED, and have this return value as its "result".
    # That's okay, for the InstructorTask will have the "real" status, and monitoring code
-    # will use that instead.
+    # should be using that instead.
    return progress


@@ -215,7 +225,7 @@ def _get_current_task():


@task(default_retry_delay=15, max_retries=5)  # pylint: disable=E1102
-def send_course_email(entry_id, email_id, to_list, global_email_context):
+def send_course_email(entry_id, email_id, to_list, global_email_context, subtask_status):
    """
    Sends an email to a list of recipients.

@@ -242,19 +252,20 @@ def send_course_email(entry_id, email_id, to_list, global_email_context):
    # Get information from current task's request:
    current_task_id = _get_current_task().request.id
    num_to_send = len(to_list)
-    log.info("Preparing to send %s emails as subtask %s for instructor task %d: request = %s",
-             num_to_send, current_task_id, entry_id, _get_current_task().request)
+    log.info("Preparing to send email %s to %d recipients as subtask %s for instructor task %d: context = %s, status=%s",
+             email_id, num_to_send, current_task_id, entry_id, global_email_context, subtask_status)

    send_exception = None
-    course_email_result_value = None
+    new_subtask_status = None
    try:
        course_title = global_email_context['course_title']
        with dog_stats_api.timer('course_email.single_task.time.overall', tags=[_statsd_tag(course_title)]):
-            course_email_result_value, send_exception = _send_course_email(
+            new_subtask_status, send_exception = _send_course_email(
                entry_id,
                email_id,
                to_list,
                global_email_context,
+                subtask_status,
            )
    except Exception:
        # Unexpected exception. Try to write out the failure to the entry before failing
@@ -264,28 +275,30 @@ def send_course_email(entry_id, email_id, to_list, global_email_context):
        # We got here for really unexpected reasons.  Since we don't know how far
        # the task got in emailing, we count all recipients as having failed.
        # It at least keeps the counts consistent.
-        course_email_result_value = create_subtask_result(0, num_to_send, 0)
+        new_subtask_status = increment_subtask_status(subtask_status, failed=num_to_send, state=FAILURE)
+        update_subtask_status(entry_id, current_task_id, new_subtask_status)
+        raise send_exception

    if send_exception is None:
        # Update the InstructorTask object that is storing its progress.
        log.info("background task (%s) succeeded", current_task_id)
-        update_subtask_status(entry_id, current_task_id, SUCCESS, course_email_result_value)
+        update_subtask_status(entry_id, current_task_id, new_subtask_status)
    elif isinstance(send_exception, RetryTaskError):
        # If retrying, record the progress made before the retry condition
        # was encountered.  Once the retry is running, it will be only processing
        # what wasn't already accomplished.
        log.warning("background task (%s) being retried", current_task_id)
-        update_subtask_status(entry_id, current_task_id, RETRY, course_email_result_value)
+        update_subtask_status(entry_id, current_task_id, new_subtask_status)
        raise send_exception
    else:
        log.error("background task (%s) failed: %s", current_task_id, send_exception)
-        update_subtask_status(entry_id, current_task_id, FAILURE, course_email_result_value)
+        update_subtask_status(entry_id, current_task_id, new_subtask_status)
        raise send_exception

-    return course_email_result_value
+    return new_subtask_status


-def _send_course_email(entry_id, email_id, to_list, global_email_context):
+def _send_course_email(entry_id, email_id, to_list, global_email_context, subtask_status):
    """
    Performs the email sending task.

@@ -312,6 +325,8 @@ def _send_course_email(entry_id, email_id, to_list, global_email_context):
        'skipped': number of emails skipped (due to optout)
        'failed': number of emails not sent because of some failure

+        The dict may also contain information about retries.
+
      * Second value is an exception returned by the innards of the method, indicating a fatal error.
        In this case, the number of recipients that were not sent have already been added to the
        'failed' count above.
@@ -319,6 +334,8 @@ def _send_course_email(entry_id, email_id, to_list, global_email_context):
    # Get information from current task's request:
    task_id = _get_current_task().request.id
    retry_index = _get_current_task().request.retries
+
+    # If this is a second attempt, then throttle the speed at which mail is sent:
    throttle = retry_index > 0

    num_optout = 0
@@ -400,6 +417,7 @@ def _send_course_email(entry_id, email_id, to_list, global_email_context):

                log.info('Email with id %s sent to %s', email_id, email)
                num_sent += 1
+
            except SMTPDataError as exc:
                # According to SMTP spec, we'll retry error codes in the 4xx range.  5xx range indicates hard failure.
                if exc.smtp_code >= 400 and exc.smtp_code < 500:
@@ -414,8 +432,15 @@ def _send_course_email(entry_id, email_id, to_list, global_email_context):
            # Pop the user that was emailed off the end of the list:
            to_list.pop()

-    except SENDING_RATE_ERRORS as exc:
-        subtask_progress = create_subtask_result(num_sent, num_error, num_optout)
+    except INFINITE_RETRY_ERRORS as exc:
+        subtask_progress = increment_subtask_status(
+            subtask_status,
+            succeeded=num_sent,
+            failed=num_error,
+            skipped=num_optout,
+            retriedA=1,
+            state=RETRY
+        )
        return _submit_for_retry(
            entry_id, email_id, to_list, global_email_context, exc, subtask_progress, True
        )
@@ -424,7 +449,14 @@ def _send_course_email(entry_id, email_id, to_list, global_email_context):
        # Errors caught here cause the email to be retried.  The entire task is actually retried
        # without popping the current recipient off of the existing list.
        # Errors caught are those that indicate a temporary condition that might succeed on retry.
-        subtask_progress = create_subtask_result(num_sent, num_error, num_optout)
+        subtask_progress = increment_subtask_status(
+            subtask_status,
+            succeeded=num_sent,
+            failed=num_error,
+            skipped=num_optout,
+            retriedB=1,
+            state=RETRY
+        )
        return _submit_for_retry(
            entry_id, email_id, to_list, global_email_context, exc, subtask_progress, False
        )
@@ -444,10 +476,24 @@ def _send_course_email(entry_id, email_id, to_list, global_email_context):
            log.exception('Task %s: email with id %d caused send_course_email task to fail with uncaught exception. To list: %s',
                          task_id, email_id, [i['email'] for i in to_list])
        num_error += len(to_list)
-        return create_subtask_result(num_sent, num_error, num_optout), exc
+        subtask_progress = increment_subtask_status(
+            subtask_status,
+            succeeded=num_sent,
+            failed=num_error,
+            skipped=num_optout,
+            state=FAILURE
+        )
+        return subtask_progress, exc
    else:
        # Successful completion is marked by an exception value of None:
-        return create_subtask_result(num_sent, num_error, num_optout), None
+        subtask_progress = increment_subtask_status(
+            subtask_status,
+            succeeded=num_sent,
+            failed=num_error,
+            skipped=num_optout,
+            state=SUCCESS
+        )
+        return subtask_progress, None
    finally:
        # clean up at the end
        connection.close()
@@ -476,16 +522,24 @@ def _submit_for_retry(entry_id, email_id, to_list, global_email_context, current
    task_id = _get_current_task().request.id
    retry_index = _get_current_task().request.retries

-    log.warning('Task %s: email with id %d not delivered due to temporary error %s, retrying send to %d recipients',
-                task_id, email_id, current_exception, len(to_list))
-
-    # Don't resend emails that have already succeeded.
-    # Retry the email at increasing exponential backoff.
+    log.info("Task %s: Successfully sent to %s users; failed to send to %s users (and skipped %s users)",
+             current_task.request.id, subtask_progress['succeeded'], subtask_progress['failed'], subtask_progress['skipped'])

+    # Calculate time until we retry this task (in seconds):
    if is_sending_rate_error:
-        countdown = ((2 ** retry_index) * 15) * random.uniform(.5, 1.5)
+        exp = min(retry_index, 5)
+        countdown = ((2 ** exp) * 15) * random.uniform(.5, 1.25)
+        exception_type = 'sending-rate'
    else:
        countdown = ((2 ** retry_index) * 15) * random.uniform(.75, 1.5)
+        exception_type = 'transient'
+
+    # max_retries is increased by the number of times an "infinite-retry" exception
+    # has been retried.  We want the regular retries to trigger a retry, but not these
+    # special retries.  So we count them separately.
+    max_retries = _get_current_task().max_retries + subtask_progress['retriedA']
+    log.warning('Task %s: email with id %d not delivered due to %s error %s, retrying send to %d recipients (with max_retry=%s)',
+                task_id, email_id, exception_type, current_exception, len(to_list), max_retries)

    try:
        send_course_email.retry(
@@ -494,9 +548,11 @@ def _submit_for_retry(entry_id, email_id, to_list, global_email_context, current
                email_id,
                to_list,
                global_email_context,
+                subtask_progress,
            ],
            exc=current_exception,
            countdown=countdown,
+            max_retries=max_retries,
            throw=True,
        )
    except RetryTaskError as retry_error:
@@ -513,7 +569,7 @@ def _submit_for_retry(entry_id, email_id, to_list, global_email_context, current
        log.exception('Task %s: email with id %d caused send_course_email task to fail to retry. To list: %s',
                      task_id, email_id, [i['email'] for i in to_list])
        num_failed = len(to_list)
-        new_subtask_progress = increment_subtask_result(subtask_progress, 0, num_failed, 0)
+        new_subtask_progress = increment_subtask_status(subtask_progress, failed=num_failed, state=FAILURE)
        return new_subtask_progress, retry_exc


--- a/lms/djangoapps/bulk_email/tests/test_email.py
+++ b/lms/djangoapps/bulk_email/tests/test_email.py
@@ -15,7 +15,7 @@ from student.tests.factories import UserFactory, GroupFactory, CourseEnrollmentF
 from xmodule.modulestore.tests.django_utils import ModuleStoreTestCase
 from xmodule.modulestore.tests.factories import CourseFactory
 from bulk_email.models import Optout
-from instructor_task.subtasks import create_subtask_result
+from instructor_task.subtasks import increment_subtask_status

 STAFF_COUNT = 3
 STUDENT_COUNT = 10
@@ -29,13 +29,13 @@ class MockCourseEmailResult(object):
    """
    emails_sent = 0

-    def get_mock_create_subtask_result(self):
+    def get_mock_increment_subtask_status(self):
        """Wrapper for mock email function."""
-        def mock_create_subtask_result(sent, failed, output, **kwargs):  # pylint: disable=W0613
+        def mock_increment_subtask_status(original_status, **kwargs):  # pylint: disable=W0613
            """Increments count of number of emails sent."""
-            self.emails_sent += sent
-            return create_subtask_result(sent, failed, output)
-        return mock_create_subtask_result
+            self.emails_sent += kwargs['succeeded']
+            return increment_subtask_status(original_status, **kwargs)
+        return mock_increment_subtask_status


@override_settings(MODULESTORE=TEST_DATA_MONGO_MODULESTORE)
@@ -244,13 +244,13 @@ class TestEmailSendFromDashboard(ModuleStoreTestCase):
        )

    @override_settings(EMAILS_PER_TASK=3, EMAILS_PER_QUERY=7)
-    @patch('bulk_email.tasks.create_subtask_result')
+    @patch('bulk_email.tasks.increment_subtask_status')
    def test_chunked_queries_send_numerous_emails(self, email_mock):
        """
        Test sending a large number of emails, to test the chunked querying
        """
        mock_factory = MockCourseEmailResult()
-        email_mock.side_effect = mock_factory.get_mock_create_subtask_result()
+        email_mock.side_effect = mock_factory.get_mock_increment_subtask_status()
        added_users = []
        for _ in xrange(LARGE_NUM_EMAILS):
            user = UserFactory()
--- a/lms/djangoapps/bulk_email/tests/test_err_handling.py
+++ b/lms/djangoapps/bulk_email/tests/test_err_handling.py
@@ -67,7 +67,7 @@ class TestEmailErrors(ModuleStoreTestCase):
        self.assertTrue(type(exc) == SMTPDataError)

    @patch('bulk_email.tasks.get_connection', autospec=True)
-    @patch('bulk_email.tasks.create_subtask_result')
+    @patch('bulk_email.tasks.increment_subtask_status')
    @patch('bulk_email.tasks.send_course_email.retry')
    def test_data_err_fail(self, retry, result, get_conn):
        """
@@ -91,11 +91,11 @@ class TestEmailErrors(ModuleStoreTestCase):
        # We shouldn't retry when hitting a 5xx error
        self.assertFalse(retry.called)
        # Test that after the rejected email, the rest still successfully send
-        ((sent, fail, optouts), _) = result.call_args
-        self.assertEquals(optouts, 0)
+        ((_initial_results), kwargs) = result.call_args
+        self.assertEquals(kwargs['skipped'], 0)
        expectedNumFails = int((settings.EMAILS_PER_TASK + 3) / 4.0)
-        self.assertEquals(fail, expectedNumFails)
-        self.assertEquals(sent, settings.EMAILS_PER_TASK - expectedNumFails)
+        self.assertEquals(kwargs['failed'], expectedNumFails)
+        self.assertEquals(kwargs['succeeded'], settings.EMAILS_PER_TASK - expectedNumFails)

    @patch('bulk_email.tasks.get_connection', autospec=True)
    @patch('bulk_email.tasks.send_course_email.retry')
@@ -138,7 +138,7 @@ class TestEmailErrors(ModuleStoreTestCase):
        exc = kwargs['exc']
        self.assertTrue(type(exc) == SMTPConnectError)

-    @patch('bulk_email.tasks.create_subtask_result')
+    @patch('bulk_email.tasks.increment_subtask_status')
    @patch('bulk_email.tasks.send_course_email.retry')
    @patch('bulk_email.tasks.log')
    @patch('bulk_email.tasks.get_connection', Mock(return_value=EmailTestException))
@@ -163,12 +163,13 @@ class TestEmailErrors(ModuleStoreTestCase):
        self.assertFalse(retry.called)
        # check the results being returned
        self.assertTrue(result.called)
-        ((sent, fail, optouts), _) = result.call_args
-        self.assertEquals(optouts, 0)
-        self.assertEquals(fail, 1)  # just myself
-        self.assertEquals(sent, 0)
+        ((initial_results, ), kwargs) = result.call_args
+        self.assertEquals(initial_results['skipped'], 0)
+        self.assertEquals(initial_results['failed'], 0)
+        self.assertEquals(initial_results['succeeded'], 0)
+        self.assertEquals(kwargs['failed'], 1)

-    @patch('bulk_email.tasks.create_subtask_result')
+    @patch('bulk_email.tasks.increment_subtask_status')
    @patch('bulk_email.tasks.log')
    def test_nonexist_email(self, mock_log, result):
        """
@@ -180,7 +181,7 @@ class TestEmailErrors(ModuleStoreTestCase):
        task_input = {"email_id": -1}
        with self.assertRaises(CourseEmail.DoesNotExist):
            perform_delegate_email_batches(entry.id, course_id, task_input, "action_name")
-        ((log_str, email_id), _) = mock_log.warning.call_args
+        ((log_str, _, email_id), _) = mock_log.warning.call_args
        self.assertTrue(mock_log.warning.called)
        self.assertIn('Failed to get CourseEmail with id', log_str)
        self.assertEqual(email_id, -1)
@@ -198,7 +199,7 @@ class TestEmailErrors(ModuleStoreTestCase):
        task_input = {"email_id": email.id}
        with self.assertRaises(Exception):
            perform_delegate_email_batches(entry.id, course_id, task_input, "action_name")
-        ((log_str, _), _) = mock_log.exception.call_args
+        ((log_str, _, _), _) = mock_log.exception.call_args
        self.assertTrue(mock_log.exception.called)
        self.assertIn('get_course_by_id failed:', log_str)

--- a/lms/djangoapps/instructor_task/subtasks.py
+++ b/lms/djangoapps/instructor_task/subtasks.py
@@ -5,7 +5,7 @@ from time import time
 import json

 from celery.utils.log import get_task_logger
-from celery.states import SUCCESS, RETRY
+from celery.states import SUCCESS, RETRY, READY_STATES

 from django.db import transaction

@@ -14,29 +14,51 @@ from instructor_task.models import InstructorTask, PROGRESS, QUEUING
 TASK_LOG = get_task_logger(__name__)


-def create_subtask_result(num_sent, num_error, num_optout):
+def create_subtask_status(succeeded=0, failed=0, pending=0, skipped=0, retriedA=0, retriedB=0, state=None):
    """
-    Create a result of a subtask.
+    Create a dict for tracking the status of a subtask.

-    Keys are:  'attempted', 'succeeded', 'skipped', 'failed'.
+    Keys are:  'attempted', 'succeeded', 'skipped', 'failed', 'retried'.
+TODO: update
+    Object must be JSON-serializable, so that it can be passed as an argument
+    to tasks.

-    Object must be JSON-serializable.
+    TODO: decide if in future we want to include specific error information
+    indicating the reason for failure.
+    Also, we should count up "not attempted" separately from
+    attempted/failed.
    """
-    attempted = num_sent + num_error
-    current_result = {'attempted': attempted, 'succeeded': num_sent, 'skipped': num_optout, 'failed': num_error}
+    attempted = succeeded + failed
+    current_result = {
+        'attempted': attempted,
+        'succeeded': succeeded,
+        'pending': pending,
+        'skipped': skipped,
+        'failed': failed,
+        'retriedA': retriedA,
+        'retriedB': retriedB,
+        'state': state if state is not None else QUEUING,
+    }
    return current_result


-def increment_subtask_result(subtask_result, new_num_sent, new_num_error, new_num_optout):
+def increment_subtask_status(subtask_result, succeeded=0, failed=0, pending=0, skipped=0, retriedA=0, retriedB=0, state=None):
    """
    Update the result of a subtask with additional results.

-    Keys are:  'attempted', 'succeeded', 'skipped', 'failed'.
+    Keys are:  'attempted', 'succeeded', 'skipped', 'failed', 'retried'.
    """
-    new_result = create_subtask_result(new_num_sent, new_num_error, new_num_optout)
+    # TODO: rewrite this if we have additional fields added to original subtask_result,
+    # that are not part of the increment.  Tradeoff on duplicating the 'attempts' logic.
+    new_result = create_subtask_status(succeeded, failed, pending, skipped, retriedA, retriedB, state)
    for keyname in new_result:
-        if keyname in subtask_result:
+        if keyname == 'state':
+            # does not get incremented.  If no new value, copy old value:
+            if state is None:
+                new_result[keyname] = subtask_result[keyname]
+        elif keyname in subtask_result:
            new_result[keyname] += subtask_result[keyname]
+
    return new_result


@@ -49,7 +71,7 @@ def update_instructor_task_for_subtasks(entry, action_name, total_num, subtask_i
    as is the 'duration_ms' value.  A 'start_time' is stored for later duration calculations,
    and the total number of "things to do" is set, so the user can be told how much needs to be
    done overall.  The `action_name` is also stored, to also help with constructing more readable
-    progress messages.
+    task_progress messages.

    The InstructorTask's "subtasks" field is also initialized.  This is also a JSON-serialized dict.
    Keys include 'total', 'succeeded', 'retried', 'failed', which are counters for the number of
@@ -70,7 +92,8 @@ def update_instructor_task_for_subtasks(entry, action_name, total_num, subtask_i
    rely on the status stored in the InstructorTask object, rather than status stored in the
    corresponding AsyncResult.
    """
-    progress = {
+    # TODO: also add 'pending' count here?  (Even though it's total-attempted-skipped
+    task_progress = {
        'action_name': action_name,
        'attempted': 0,
        'failed': 0,
@@ -80,22 +103,33 @@ def update_instructor_task_for_subtasks(entry, action_name, total_num, subtask_i
        'duration_ms': int(0),
        'start_time': time()
    }
-    entry.task_output = InstructorTask.create_output_for_success(progress)
+    entry.task_output = InstructorTask.create_output_for_success(task_progress)
    entry.task_state = PROGRESS

    # Write out the subtasks information.
    num_subtasks = len(subtask_id_list)
-    subtask_status = dict.fromkeys(subtask_id_list, QUEUING)
-    subtask_dict = {'total': num_subtasks, 'succeeded': 0, 'failed': 0, 'retried': 0, 'status': subtask_status}
+    # using fromkeys to initialize uses a single value.  we need the value
+    # to be distinct, since it's now a dict:
+    # subtask_status = dict.fromkeys(subtask_id_list, QUEUING)
+    # TODO: may not be necessary to store initial value with all those zeroes!
+    # Instead, use a placemarker....
+    subtask_status = {subtask_id: create_subtask_status() for subtask_id in subtask_id_list}
+    subtask_dict = {
+        'total': num_subtasks,
+        'succeeded': 0,
+        'failed': 0,
+        'retried': 0,
+        'status': subtask_status
+    }
    entry.subtasks = json.dumps(subtask_dict)

    # and save the entry immediately, before any subtasks actually start work:
    entry.save_now()
-    return progress
+    return task_progress


@transaction.commit_manually
-def update_subtask_status(entry_id, current_task_id, status, subtask_result):
+def update_subtask_status(entry_id, current_task_id, subtask_status):
    """
    Update the status of the subtask in the parent InstructorTask object tracking its progress.

@@ -104,7 +138,7 @@ def update_subtask_status(entry_id, current_task_id, status, subtask_result):
    committed on completion, or rolled back on error.

    The InstructorTask's "task_output" field is updated.  This is a JSON-serialized dict.
-    Accumulates values for 'attempted', 'succeeded', 'failed', 'skipped' from `subtask_result`
+    Accumulates values for 'attempted', 'succeeded', 'failed', 'skipped' from `subtask_progress`
    into the corresponding values in the InstructorTask's task_output.  Also updates the 'duration_ms'
    value with the current interval since the original InstructorTask started.

@@ -121,7 +155,7 @@ def update_subtask_status(entry_id, current_task_id, status, subtask_result):
    messages, progress made, etc.
    """
    TASK_LOG.info("Preparing to update status for email subtask %s for instructor task %d with status %s",
-                  current_task_id, entry_id, subtask_result)
+                  current_task_id, entry_id, subtask_status)

    try:
        entry = InstructorTask.objects.select_for_update().get(pk=entry_id)
@@ -140,28 +174,38 @@ def update_subtask_status(entry_id, current_task_id, status, subtask_result):
        # ultimate status to be clobbered by the "earlier" updates.  This
        # should not be a problem in normal (non-eager) processing.
        old_status = subtask_status[current_task_id]
-        if status != RETRY or old_status == QUEUING:
-            subtask_status[current_task_id] = status
+        # TODO: check this logic...
+        state = subtask_status['state']
+#        if state != RETRY or old_status['status'] == QUEUING:
+        # instead replace the status only if it's 'newer'
+        # i.e. has fewer pending
+        if subtask_status['pending'] <= old_status['pending']:
+            subtask_status[current_task_id] = subtask_status

        # Update the parent task progress
        task_progress = json.loads(entry.task_output)
        start_time = task_progress['start_time']
        task_progress['duration_ms'] = int((time() - start_time) * 1000)
-        if subtask_result is not None:
+        # change  behavior so we don't update on progress now:
+        # TODO: figure out if we can make this more responsive later,
+        # by figuring out how to handle retries better.
+        if subtask_status is not None and state in READY_STATES:
            for statname in ['attempted', 'succeeded', 'failed', 'skipped']:
-                task_progress[statname] += subtask_result[statname]
+                task_progress[statname] += subtask_status[statname]

        # Figure out if we're actually done (i.e. this is the last task to complete).
        # This is easier if we just maintain a counter, rather than scanning the
        # entire subtask_status dict.
-        if status == SUCCESS:
+        if state == SUCCESS:
            subtask_dict['succeeded'] += 1
-        elif status == RETRY:
+        elif state == RETRY:
            subtask_dict['retried'] += 1
        else:
            subtask_dict['failed'] += 1
        num_remaining = subtask_dict['total'] - subtask_dict['succeeded'] - subtask_dict['failed']
        # If we're done with the last task, update the parent status to indicate that:
+        # TODO: see if there was a catastrophic failure that occurred, and figure out
+        # how to report that here.
        if num_remaining <= 0:
            entry.task_state = SUCCESS
        entry.subtasks = json.dumps(subtask_dict)