fix: Always save generated anonymous user ID in DB; ignore save=False (#26399)

This deprecates `save=False` for several functions and removes all known
usages of the parameter but does not actually remove the parameter.
Instead, it will emit a deprecation warning if the parameter is used.
We can remove the parameter as soon as we feel sure nothing is using it.

Now that we have refactored `anonymous_id_for_user` to always prefer
retrieving an existing ID from the database -- and observed that only a
small fraction of calls pass save=False -- we can stop respecting
save=False. This opens the door for future improvements, such as generating
random IDs or switching to the external user ID system.

Metrics: I observe that 1 in 16 requests for new, non-request-cached
anon user IDs are made with save=False. But 71% of all calls are served
from the request cache, and 99.7% of the misses are served from the DB.
save=False only appear to come from intermittent spikes as reports are
generated and are low in absolute number.

Also document usage/risk/rotation of secret in anonymous user ID
generation as indicated by `docs/decisions/0008-secret-key-usage.rst`
ADR on `SECRET_KEY` usage.

ref: ARCHBOM-1683
This commit is contained in:
Tim McCormack
2021-02-08 19:16:05 +00:00
committed by GitHub
parent 54505b82c4
commit 80a4437f33
6 changed files with 62 additions and 31 deletions

View File

@@ -21,6 +21,7 @@ from datetime import datetime, timedelta
from functools import total_ordering
from importlib import import_module
from urllib.parse import urlencode
import warnings
import six
from config_models.models import ConfigurationModel
@@ -153,12 +154,12 @@ class AnonymousUserId(models.Model):
course_id = LearningContextKeyField(db_index=True, max_length=255, blank=True)
def anonymous_id_for_user(user, course_id, save=True):
def anonymous_id_for_user(user, course_id, save='DEPRECATED'):
"""
Inputs:
user: User model
course_id: string or None
save: Whether the id should be saved in an AnonymousUserId object, defaults to True
save: Deprecated and ignored: ID is always saved in an AnonymousUserId object
Return a unique id for a (user, course_id) pair, suitable for inserting
into e.g. personalized survey links.
@@ -171,28 +172,56 @@ def anonymous_id_for_user(user, course_id, save=True):
# This part is for ability to get xblock instance in xblock_noauth handlers, where user is unauthenticated.
assert user
if save != 'DEPRECATED':
warnings.warn(
"anonymous_id_for_user no longer accepts save param and now "
"always saves the ID in the database",
DeprecationWarning
)
if user.is_anonymous:
return None
# ARCHBOM-1674: Get a sense of what fraction of anonymous_user_id calls are
# cached, stored in the DB, or retrieved from the DB. This will help inform
# us on decisions about whether we can move to always save IDs,
# pregenerate them, use random instead of deterministic IDs, etc.
# us on decisions about whether we can
# pregenerate IDs, use random instead of deterministic IDs, etc.
monitoring.increment('temp_anon_uid_v2.requested')
cached_id = getattr(user, '_anonymous_id', {}).get(course_id)
if cached_id is not None:
monitoring.increment('temp_anon_uid_v2.returned_from_cache')
return cached_id
# check if an anonymous id already exists for this user and course_id combination
# Check if an anonymous id already exists for this user and
# course_id combination. Prefer the one with the highest record ID
# (see below.)
anonymous_user_ids = AnonymousUserId.objects.filter(user=user).filter(course_id=course_id).order_by('-id')
if anonymous_user_ids:
# If there are multiple anonymous_user_ids per user, course_id pair
# select the row which was created most recently
# select the row which was created most recently.
# There might be more than one if the Django SECRET_KEY had
# previously been rotated at a time before this function was
# changed to always save the generated IDs to the DB. In that
# case, just pick the one with the highest record ID, which is
# probably the most recently created one.
anonymous_user_id = anonymous_user_ids[0].anonymous_user_id
monitoring.increment('temp_anon_uid_v2.fetched_existing')
else:
# include the secret key as a salt, and to make the ids unique across different LMS installs.
# Uses SECRET_KEY as a cryptographic pepper. This
# deterministic ID generation means that concurrent identical
# calls to this function return the same value -- no need for
# locking. (There may be a low level of integrity errors on
# creation as a result of concurrent duplicate row inserts.)
#
# Consequences for this function of SECRET_KEY exposure: Data
# researchers and other third parties receiving these
# anonymous user IDs would be able to identify users across
# courses, and predict the anonymous user IDs of all users
# (but not necessarily identify their accounts.)
#
# Rotation process of SECRET_KEY with respect to this
# function: Rotate at will, since the hashes are stored and
# will not change.
hasher = hashlib.md5()
hasher.update(settings.SECRET_KEY.encode('utf8'))
hasher.update(text_type(user.id).encode('utf8'))
@@ -200,20 +229,17 @@ def anonymous_id_for_user(user, course_id, save=True):
hasher.update(text_type(course_id).encode('utf-8'))
anonymous_user_id = hasher.hexdigest()
if save is True:
try:
AnonymousUserId.objects.create(
user=user,
course_id=course_id,
anonymous_user_id=anonymous_user_id,
)
monitoring.increment('temp_anon_uid_v2.stored')
except IntegrityError:
# Another thread has already created this entry, so
# continue
monitoring.increment('temp_anon_uid_v2.store_db_error')
else:
monitoring.increment('temp_anon_uid_v2.computed_unsaved')
try:
AnonymousUserId.objects.create(
user=user,
course_id=course_id,
anonymous_user_id=anonymous_user_id,
)
monitoring.increment('temp_anon_uid_v2.stored')
except IntegrityError:
# Another thread has already created this entry, so
# continue
monitoring.increment('temp_anon_uid_v2.store_db_error')
# cache the anonymous_id in the user object
if not hasattr(user, '_anonymous_id'):
@@ -829,17 +855,23 @@ class UserSignupSource(models.Model):
site = models.CharField(max_length=255, db_index=True)
def unique_id_for_user(user, save=True):
def unique_id_for_user(user, save='DEPRECATED'):
"""
Return a unique id for a user, suitable for inserting into
e.g. personalized survey links.
Keyword arguments:
save -- Whether the id should be saved in an AnonymousUserId object.
save -- Deprecated and ignored: ID is always saved in an AnonymousUserId object
"""
if save != 'DEPRECATED':
warnings.warn(
"unique_id_for_user no longer accepts save param and now "
"always saves the ID in the database",
DeprecationWarning
)
# Setting course_id to '' makes it not affect the generated hash,
# and thus produce the old per-student anonymous id
return anonymous_id_for_user(user, None, save=save)
return anonymous_id_for_user(user, None)
# TODO: Should be renamed to generic UserGroup, and possibly

View File

@@ -999,7 +999,7 @@ class AnonymousLookupTable(ModuleStoreTestCase):
anonymous_id = anonymous_id_for_user(self.user, self.course.id)
real_user = user_by_anonymous_id(anonymous_id)
self.assertEqual(self.user, real_user)
self.assertEqual(anonymous_id, anonymous_id_for_user(self.user, self.course.id, save=False))
self.assertEqual(anonymous_id, anonymous_id_for_user(self.user, self.course.id))
def test_roundtrip_with_unicode_course_id(self):
course2 = CourseFactory.create(display_name=u"Omega Course Ω")
@@ -1007,7 +1007,7 @@ class AnonymousLookupTable(ModuleStoreTestCase):
anonymous_id = anonymous_id_for_user(self.user, course2.id)
real_user = user_by_anonymous_id(anonymous_id)
self.assertEqual(self.user, real_user)
self.assertEqual(anonymous_id, anonymous_id_for_user(self.user, course2.id, save=False))
self.assertEqual(anonymous_id, anonymous_id_for_user(self.user, course2.id))
def test_anonymous_id_secret_key_changes_do_not_change_existing_anonymous_ids(self):
"""Test that a same anonymous id is returned when the SECRET_KEY changes."""

View File

@@ -105,8 +105,7 @@ class UserServiceTestCase(TestCase):
course_key = CourseKey.from_string('edX/toy/2012_Fall')
anon_user_id = anonymous_id_for_user(
user=self.user,
course_id=course_key,
save=True
course_id=course_key
)
django_user_service = DjangoXBlockUserService(self.user, user_is_staff=True)

View File

@@ -66,7 +66,7 @@ class DjangoXBlockUserService(UserService):
return None
course_id = CourseKey.from_string(course_id)
return anonymous_id_for_user(user=user, course_id=course_id, save=False)
return anonymous_id_for_user(user=user, course_id=course_id)
def _convert_django_user_to_xblock_user(self, django_user):
"""

View File

@@ -1413,7 +1413,7 @@ def get_anon_ids(request, course_id):
courseenrollment__course_id=course_id,
).order_by('id')
header = ['User ID', 'Anonymized User ID', 'Course Specific Anonymized User ID']
rows = [[s.id, unique_id_for_user(s, save=False), anonymous_id_for_user(s, course_id, save=False)]
rows = [[s.id, unique_id_for_user(s), anonymous_id_for_user(s, course_id)]
for s in students]
return csv_response(text_type(course_id).replace('/', '-') + '-anon-ids.csv', header, rows)

View File

@@ -392,7 +392,7 @@ def anonymous_user_ids_for_team(user, team):
))
return sorted([
anonymous_id_for_user(user=team_member, course_id=team.course_id, save=True)
anonymous_id_for_user(user=team_member, course_id=team.course_id)
for team_member in team.users.all()
])