fix: Always save generated anonymous user ID in DB; ignore save=False (#26399)
This deprecates `save=False` for several functions and removes all known usages of the parameter but does not actually remove the parameter. Instead, it will emit a deprecation warning if the parameter is used. We can remove the parameter as soon as we feel sure nothing is using it. Now that we have refactored `anonymous_id_for_user` to always prefer retrieving an existing ID from the database -- and observed that only a small fraction of calls pass save=False -- we can stop respecting save=False. This opens the door for future improvements, such as generating random IDs or switching to the external user ID system. Metrics: I observe that 1 in 16 requests for new, non-request-cached anon user IDs are made with save=False. But 71% of all calls are served from the request cache, and 99.7% of the misses are served from the DB. save=False only appear to come from intermittent spikes as reports are generated and are low in absolute number. Also document usage/risk/rotation of secret in anonymous user ID generation as indicated by `docs/decisions/0008-secret-key-usage.rst` ADR on `SECRET_KEY` usage. ref: ARCHBOM-1683
This commit is contained in:
@@ -21,6 +21,7 @@ from datetime import datetime, timedelta
|
||||
from functools import total_ordering
|
||||
from importlib import import_module
|
||||
from urllib.parse import urlencode
|
||||
import warnings
|
||||
|
||||
import six
|
||||
from config_models.models import ConfigurationModel
|
||||
@@ -153,12 +154,12 @@ class AnonymousUserId(models.Model):
|
||||
course_id = LearningContextKeyField(db_index=True, max_length=255, blank=True)
|
||||
|
||||
|
||||
def anonymous_id_for_user(user, course_id, save=True):
|
||||
def anonymous_id_for_user(user, course_id, save='DEPRECATED'):
|
||||
"""
|
||||
Inputs:
|
||||
user: User model
|
||||
course_id: string or None
|
||||
save: Whether the id should be saved in an AnonymousUserId object, defaults to True
|
||||
save: Deprecated and ignored: ID is always saved in an AnonymousUserId object
|
||||
|
||||
Return a unique id for a (user, course_id) pair, suitable for inserting
|
||||
into e.g. personalized survey links.
|
||||
@@ -171,28 +172,56 @@ def anonymous_id_for_user(user, course_id, save=True):
|
||||
# This part is for ability to get xblock instance in xblock_noauth handlers, where user is unauthenticated.
|
||||
assert user
|
||||
|
||||
if save != 'DEPRECATED':
|
||||
warnings.warn(
|
||||
"anonymous_id_for_user no longer accepts save param and now "
|
||||
"always saves the ID in the database",
|
||||
DeprecationWarning
|
||||
)
|
||||
|
||||
if user.is_anonymous:
|
||||
return None
|
||||
|
||||
# ARCHBOM-1674: Get a sense of what fraction of anonymous_user_id calls are
|
||||
# cached, stored in the DB, or retrieved from the DB. This will help inform
|
||||
# us on decisions about whether we can move to always save IDs,
|
||||
# pregenerate them, use random instead of deterministic IDs, etc.
|
||||
# us on decisions about whether we can
|
||||
# pregenerate IDs, use random instead of deterministic IDs, etc.
|
||||
monitoring.increment('temp_anon_uid_v2.requested')
|
||||
|
||||
cached_id = getattr(user, '_anonymous_id', {}).get(course_id)
|
||||
if cached_id is not None:
|
||||
monitoring.increment('temp_anon_uid_v2.returned_from_cache')
|
||||
return cached_id
|
||||
# check if an anonymous id already exists for this user and course_id combination
|
||||
# Check if an anonymous id already exists for this user and
|
||||
# course_id combination. Prefer the one with the highest record ID
|
||||
# (see below.)
|
||||
anonymous_user_ids = AnonymousUserId.objects.filter(user=user).filter(course_id=course_id).order_by('-id')
|
||||
if anonymous_user_ids:
|
||||
# If there are multiple anonymous_user_ids per user, course_id pair
|
||||
# select the row which was created most recently
|
||||
# select the row which was created most recently.
|
||||
# There might be more than one if the Django SECRET_KEY had
|
||||
# previously been rotated at a time before this function was
|
||||
# changed to always save the generated IDs to the DB. In that
|
||||
# case, just pick the one with the highest record ID, which is
|
||||
# probably the most recently created one.
|
||||
anonymous_user_id = anonymous_user_ids[0].anonymous_user_id
|
||||
monitoring.increment('temp_anon_uid_v2.fetched_existing')
|
||||
else:
|
||||
# include the secret key as a salt, and to make the ids unique across different LMS installs.
|
||||
# Uses SECRET_KEY as a cryptographic pepper. This
|
||||
# deterministic ID generation means that concurrent identical
|
||||
# calls to this function return the same value -- no need for
|
||||
# locking. (There may be a low level of integrity errors on
|
||||
# creation as a result of concurrent duplicate row inserts.)
|
||||
#
|
||||
# Consequences for this function of SECRET_KEY exposure: Data
|
||||
# researchers and other third parties receiving these
|
||||
# anonymous user IDs would be able to identify users across
|
||||
# courses, and predict the anonymous user IDs of all users
|
||||
# (but not necessarily identify their accounts.)
|
||||
#
|
||||
# Rotation process of SECRET_KEY with respect to this
|
||||
# function: Rotate at will, since the hashes are stored and
|
||||
# will not change.
|
||||
hasher = hashlib.md5()
|
||||
hasher.update(settings.SECRET_KEY.encode('utf8'))
|
||||
hasher.update(text_type(user.id).encode('utf8'))
|
||||
@@ -200,20 +229,17 @@ def anonymous_id_for_user(user, course_id, save=True):
|
||||
hasher.update(text_type(course_id).encode('utf-8'))
|
||||
anonymous_user_id = hasher.hexdigest()
|
||||
|
||||
if save is True:
|
||||
try:
|
||||
AnonymousUserId.objects.create(
|
||||
user=user,
|
||||
course_id=course_id,
|
||||
anonymous_user_id=anonymous_user_id,
|
||||
)
|
||||
monitoring.increment('temp_anon_uid_v2.stored')
|
||||
except IntegrityError:
|
||||
# Another thread has already created this entry, so
|
||||
# continue
|
||||
monitoring.increment('temp_anon_uid_v2.store_db_error')
|
||||
else:
|
||||
monitoring.increment('temp_anon_uid_v2.computed_unsaved')
|
||||
try:
|
||||
AnonymousUserId.objects.create(
|
||||
user=user,
|
||||
course_id=course_id,
|
||||
anonymous_user_id=anonymous_user_id,
|
||||
)
|
||||
monitoring.increment('temp_anon_uid_v2.stored')
|
||||
except IntegrityError:
|
||||
# Another thread has already created this entry, so
|
||||
# continue
|
||||
monitoring.increment('temp_anon_uid_v2.store_db_error')
|
||||
|
||||
# cache the anonymous_id in the user object
|
||||
if not hasattr(user, '_anonymous_id'):
|
||||
@@ -829,17 +855,23 @@ class UserSignupSource(models.Model):
|
||||
site = models.CharField(max_length=255, db_index=True)
|
||||
|
||||
|
||||
def unique_id_for_user(user, save=True):
|
||||
def unique_id_for_user(user, save='DEPRECATED'):
|
||||
"""
|
||||
Return a unique id for a user, suitable for inserting into
|
||||
e.g. personalized survey links.
|
||||
|
||||
Keyword arguments:
|
||||
save -- Whether the id should be saved in an AnonymousUserId object.
|
||||
save -- Deprecated and ignored: ID is always saved in an AnonymousUserId object
|
||||
"""
|
||||
if save != 'DEPRECATED':
|
||||
warnings.warn(
|
||||
"unique_id_for_user no longer accepts save param and now "
|
||||
"always saves the ID in the database",
|
||||
DeprecationWarning
|
||||
)
|
||||
# Setting course_id to '' makes it not affect the generated hash,
|
||||
# and thus produce the old per-student anonymous id
|
||||
return anonymous_id_for_user(user, None, save=save)
|
||||
return anonymous_id_for_user(user, None)
|
||||
|
||||
|
||||
# TODO: Should be renamed to generic UserGroup, and possibly
|
||||
|
||||
@@ -999,7 +999,7 @@ class AnonymousLookupTable(ModuleStoreTestCase):
|
||||
anonymous_id = anonymous_id_for_user(self.user, self.course.id)
|
||||
real_user = user_by_anonymous_id(anonymous_id)
|
||||
self.assertEqual(self.user, real_user)
|
||||
self.assertEqual(anonymous_id, anonymous_id_for_user(self.user, self.course.id, save=False))
|
||||
self.assertEqual(anonymous_id, anonymous_id_for_user(self.user, self.course.id))
|
||||
|
||||
def test_roundtrip_with_unicode_course_id(self):
|
||||
course2 = CourseFactory.create(display_name=u"Omega Course Ω")
|
||||
@@ -1007,7 +1007,7 @@ class AnonymousLookupTable(ModuleStoreTestCase):
|
||||
anonymous_id = anonymous_id_for_user(self.user, course2.id)
|
||||
real_user = user_by_anonymous_id(anonymous_id)
|
||||
self.assertEqual(self.user, real_user)
|
||||
self.assertEqual(anonymous_id, anonymous_id_for_user(self.user, course2.id, save=False))
|
||||
self.assertEqual(anonymous_id, anonymous_id_for_user(self.user, course2.id))
|
||||
|
||||
def test_anonymous_id_secret_key_changes_do_not_change_existing_anonymous_ids(self):
|
||||
"""Test that a same anonymous id is returned when the SECRET_KEY changes."""
|
||||
|
||||
@@ -105,8 +105,7 @@ class UserServiceTestCase(TestCase):
|
||||
course_key = CourseKey.from_string('edX/toy/2012_Fall')
|
||||
anon_user_id = anonymous_id_for_user(
|
||||
user=self.user,
|
||||
course_id=course_key,
|
||||
save=True
|
||||
course_id=course_key
|
||||
)
|
||||
|
||||
django_user_service = DjangoXBlockUserService(self.user, user_is_staff=True)
|
||||
|
||||
@@ -66,7 +66,7 @@ class DjangoXBlockUserService(UserService):
|
||||
return None
|
||||
|
||||
course_id = CourseKey.from_string(course_id)
|
||||
return anonymous_id_for_user(user=user, course_id=course_id, save=False)
|
||||
return anonymous_id_for_user(user=user, course_id=course_id)
|
||||
|
||||
def _convert_django_user_to_xblock_user(self, django_user):
|
||||
"""
|
||||
|
||||
@@ -1413,7 +1413,7 @@ def get_anon_ids(request, course_id):
|
||||
courseenrollment__course_id=course_id,
|
||||
).order_by('id')
|
||||
header = ['User ID', 'Anonymized User ID', 'Course Specific Anonymized User ID']
|
||||
rows = [[s.id, unique_id_for_user(s, save=False), anonymous_id_for_user(s, course_id, save=False)]
|
||||
rows = [[s.id, unique_id_for_user(s), anonymous_id_for_user(s, course_id)]
|
||||
for s in students]
|
||||
return csv_response(text_type(course_id).replace('/', '-') + '-anon-ids.csv', header, rows)
|
||||
|
||||
|
||||
@@ -392,7 +392,7 @@ def anonymous_user_ids_for_team(user, team):
|
||||
))
|
||||
|
||||
return sorted([
|
||||
anonymous_id_for_user(user=team_member, course_id=team.course_id, save=True)
|
||||
anonymous_id_for_user(user=team_member, course_id=team.course_id)
|
||||
for team_member in team.users.all()
|
||||
])
|
||||
|
||||
|
||||
Reference in New Issue
Block a user