fix: libraries performance problem

This is an attempt to fix a performance problem on the libraries home page. When you go to studio home and click on the libraries tab, on prod it will be quick for admins but extremely slow for course instructors (> 12 seconds) and leads to timeouts. It grows with the number of libraries that are assigned to the instructor.

The Python code for the request to load libraries for a particular user goes through all existing libraries and then checks all of the user's roles for each library, which results in a complexity of O(l*r), l=libraries, r=roles. This PR improves the complexity to O(l).

The BulkRoleCache and RoleCache classes were using a python set to store all roles for a particular user. A user can have a large number of roles, and lookup speed of iterating through a set is slow (O(n)). Most roles don't have the same course id, however. So if you have the course id of the role you're looking for, we can use a dict of course ids that contain related roles. The number of roles per course id is negligible, so we arrive at a lookup speed of O(1) when looking up a user's roles that belong to a specific course id.

The BulkRoleCache now caches and stores user roles in a data structure like this:
    {
        user_id_1: {
            course_id_1: {role1, role2, role3},  # Set of roles associated with course_id_1
            course_id_2: {role4, role5, role6},  # Set of roles associated with course_id_2
            [ROLE_CACHE_UNGROUPED_ROLES_KEY]: {role7, role8}  # Set of roles not tied to any specific course or library. For example, Global Staff roles.
        },
        user_id_2: { ... }  # Similar structure for another user
    }

While this changes the data structure used to store roles under the hood and adds the new property `roles_by_course_id` to the RoleCache,
when initializing the RoleCache will store roles additionally in the previous data structure - as a flat set - in the `_roles` property accessible via `all_roles_set`. This establishes
backwards compatibility.

We are now storing roles twice in the RoleCache (in each of the two data structures), which means this takes twice as much memory, but only in the scope of a request.
This commit is contained in:
Jesper Hodge
2024-05-20 14:37:58 -04:00
committed by Jesper Hodge
parent e7b4db0862
commit e95d7e7e32
6 changed files with 157 additions and 20 deletions

View File

@@ -215,4 +215,5 @@ class HomePageLibrariesView(APIView):
library_context = get_library_context(request)
serializer = LibraryTabSerializer(library_context)
return Response(serializer.data)

View File

@@ -52,7 +52,7 @@ from common.djangoapps.student.roles import (
CourseStaffRole,
GlobalStaff,
UserBasedRole,
OrgStaffRole
OrgStaffRole,
)
from common.djangoapps.util.json_request import JsonResponse, JsonResponseBadRequest, expect_json
from common.djangoapps.util.string_utils import _has_non_ascii_characters

View File

@@ -23,7 +23,7 @@ from common.djangoapps.student.roles import (
OrgContentCreatorRole,
OrgInstructorRole,
OrgLibraryUserRole,
OrgStaffRole
OrgStaffRole,
)
# Studio permissions:
@@ -106,6 +106,7 @@ def get_user_permissions(user, course_key, org=None):
# Staff have all permissions except EDIT_ROLES:
if OrgStaffRole(org=org).has_user(user) or (course_key and user_has_role(user, CourseStaffRole(course_key))):
return STUDIO_VIEW_USERS | STUDIO_EDIT_CONTENT | STUDIO_VIEW_CONTENT
# Otherwise, for libraries, users can view only:
if course_key and isinstance(course_key, LibraryLocator):
if OrgLibraryUserRole(org=org).has_user(user) or user_has_role(user, LibraryUserRole(course_key)):

View File

@@ -75,4 +75,4 @@ def get_course_roles(user: User) -> list[CourseAccessRole]:
"""
# pylint: disable=protected-access
role_cache = get_role_cache(user)
return list(role_cache._roles)
return list(role_cache.all_roles_set)

View File

@@ -4,9 +4,9 @@ adding users, removing users, and listing members
"""
from collections import defaultdict
import logging
from abc import ABCMeta, abstractmethod
from collections import defaultdict
from contextlib import contextmanager
from django.contrib.auth.models import User # lint-amnesty, pylint: disable=imported-auth-user
@@ -23,6 +23,9 @@ REGISTERED_ACCESS_ROLES = {}
# A mapping of roles to the roles that they inherit permissions from.
ACCESS_ROLES_INHERITANCE = {}
# The key used to store roles for a user in the cache that do not belong to a course or do not have a course id.
ROLE_CACHE_UNGROUPED_ROLES__KEY = 'ungrouped'
def register_access_role(cls):
"""
@@ -60,21 +63,52 @@ def strict_role_checking():
ACCESS_ROLES_INHERITANCE.update(OLD_ACCESS_ROLES_INHERITANCE)
def get_role_cache_key_for_course(course_key=None):
"""
Get the cache key for the course key.
"""
return str(course_key) if course_key else ROLE_CACHE_UNGROUPED_ROLES__KEY
class BulkRoleCache: # lint-amnesty, pylint: disable=missing-class-docstring
"""
This class provides a caching mechanism for roles grouped by users and courses,
using a nested dictionary structure to optimize lookup performance. The cache structure is designed as follows:
{
user_id_1: {
course_id_1: {role1, role2, role3}, # Set of roles associated with course_id_1
course_id_2: {role4, role5, role6}, # Set of roles associated with course_id_2
[ROLE_CACHE_UNGROUPED_ROLES_KEY]: {role7, role8} # Set of roles not tied to any specific course or library
},
user_id_2: { ... } # Similar structure for another user
}
- Each top-level dictionary entry keys by `user_id` to access role data for a specific user.
- Nested within each user's dictionary, entries are keyed by `course_id` grouping roles by course.
- The special key `ROLE_CACHE_UNGROUPED_ROLES_KEY` (a constant defined above)
stores roles that are not associated with any specific course or library.
"""
CACHE_NAMESPACE = "student.roles.BulkRoleCache"
CACHE_KEY = 'roles_by_user'
@classmethod
def prefetch(cls, users): # lint-amnesty, pylint: disable=missing-function-docstring
roles_by_user = defaultdict(set)
roles_by_user = defaultdict(lambda: defaultdict(set))
get_cache(cls.CACHE_NAMESPACE)[cls.CACHE_KEY] = roles_by_user
for role in CourseAccessRole.objects.filter(user__in=users).select_related('user'):
roles_by_user[role.user.id].add(role)
user_id = role.user.id
course_id = get_role_cache_key_for_course(role.course_id)
# Add role to the set in roles_by_user[user_id][course_id]
user_roles_set_for_course = roles_by_user[user_id][course_id]
user_roles_set_for_course.add(role)
users_without_roles = [u for u in users if u.id not in roles_by_user]
for user in users_without_roles:
roles_by_user[user.id] = set()
roles_by_user[user.id] = {}
@classmethod
def get_user_roles(cls, user):
@@ -83,15 +117,32 @@ class BulkRoleCache: # lint-amnesty, pylint: disable=missing-class-docstring
class RoleCache:
"""
A cache of the CourseAccessRoles held by a particular user
A cache of the CourseAccessRoles held by a particular user.
Internal data structures should be accessed by getter and setter methods;
don't use `_roles_by_course_id` or `_roles` directly.
_roles_by_course_id: This is the data structure as saved in the RequestCache.
It contains all roles for a user as a dict that's keyed by course_id.
The key ROLE_CACHE_UNGROUPED_ROLES__KEY is used for all roles
that are not associated with a course.
_roles: This is a set of all roles for a user, ungrouped. It's used for some types of
lookups and collected from _roles_by_course_id on initialization
so that it doesn't need to be recalculated.
"""
def __init__(self, user):
try:
self._roles = BulkRoleCache.get_user_roles(user)
self._roles_by_course_id = BulkRoleCache.get_user_roles(user)
except KeyError:
self._roles = set(
CourseAccessRole.objects.filter(user=user).all()
)
self._roles_by_course_id = {}
roles = CourseAccessRole.objects.filter(user=user).all()
for role in roles:
course_id = get_role_cache_key_for_course(role.course_id)
if not self._roles_by_course_id.get(course_id):
self._roles_by_course_id[course_id] = set()
self._roles_by_course_id[course_id].add(role)
self._roles = set()
for roles_for_course in self._roles_by_course_id.values():
self._roles.update(roles_for_course)
@staticmethod
def get_roles(role):
@@ -100,16 +151,24 @@ class RoleCache:
"""
return ACCESS_ROLES_INHERITANCE.get(role, set()) | {role}
@property
def all_roles_set(self):
return self._roles
@property
def roles_by_course_id(self):
return self._roles_by_course_id
def has_role(self, role, course_id, org):
"""
Return whether this RoleCache contains a role with the specified role
or a role that inherits from the specified role, course_id and org.
"""
course_id_string = get_role_cache_key_for_course(course_id)
course_roles = self._roles_by_course_id.get(course_id_string, [])
return any(
access_role.role in self.get_roles(role) and
access_role.course_id == course_id and
access_role.org == org
for access_role in self._roles
access_role.role in self.get_roles(role) and access_role.org == org
for access_role in course_roles
)

View File

@@ -6,6 +6,7 @@ Tests of student.roles
import ddt
from django.test import TestCase
from opaque_keys.edx.keys import CourseKey
from opaque_keys.edx.locator import LibraryLocator
from common.djangoapps.student.roles import (
CourseAccessRole,
@@ -22,7 +23,9 @@ from common.djangoapps.student.roles import (
OrgContentCreatorRole,
OrgInstructorRole,
OrgStaffRole,
RoleCache
RoleCache,
get_role_cache_key_for_course,
ROLE_CACHE_UNGROUPED_ROLES__KEY
)
from common.djangoapps.student.role_helpers import get_course_roles, has_staff_roles
from common.djangoapps.student.tests.factories import AnonymousUserFactory, InstructorFactory, StaffFactory, UserFactory
@@ -35,7 +38,7 @@ class RolesTestCase(TestCase):
def setUp(self):
super().setUp()
self.course_key = CourseKey.from_string('edX/toy/2012_Fall')
self.course_key = CourseKey.from_string('course-v1:course-v1:edX+toy+2012_Fall')
self.course_loc = self.course_key.make_usage_key('course', '2012_Fall')
self.anonymous_user = AnonymousUserFactory()
self.student = UserFactory()
@@ -189,8 +192,9 @@ class RolesTestCase(TestCase):
@ddt.ddt
class RoleCacheTestCase(TestCase): # lint-amnesty, pylint: disable=missing-class-docstring
IN_KEY = CourseKey.from_string('edX/toy/2012_Fall')
NOT_IN_KEY = CourseKey.from_string('edX/toy/2013_Fall')
IN_KEY_STRING = 'course-v1:edX+toy+2012_Fall'
IN_KEY = CourseKey.from_string(IN_KEY_STRING)
NOT_IN_KEY = CourseKey.from_string('course-v1:edX+toy+2013_Fall')
ROLES = (
(CourseStaffRole(IN_KEY), ('staff', IN_KEY, 'edX')),
@@ -233,3 +237,75 @@ class RoleCacheTestCase(TestCase): # lint-amnesty, pylint: disable=missing-clas
def test_empty_cache(self, role, target): # lint-amnesty, pylint: disable=unused-argument
cache = RoleCache(self.user)
assert not cache.has_role(*target)
def test_get_role_cache_key_for_course_for_course_object_gets_string(self):
"""
Given a valid course key object, get_role_cache_key_for_course
should return the string representation of the key.
"""
course_string = 'course-v1:edX+toy+2012_Fall'
key = CourseKey.from_string(course_string)
key = get_role_cache_key_for_course(key)
assert key == course_string
def test_get_role_cache_key_for_course_for_undefined_object_returns_default(self):
"""
Given a value None, get_role_cache_key_for_course
should return the default key for ungrouped courses.
"""
key = get_role_cache_key_for_course(None)
assert key == ROLE_CACHE_UNGROUPED_ROLES__KEY
def test_role_cache_get_roles_set(self):
"""
Test that the RoleCache.all_roles_set getter method returns a flat set of all roles for a user
and that the ._roles attribute is the same as the set to avoid legacy behavior being broken.
"""
lib0 = LibraryLocator.from_string('library-v1:edX+quizzes')
course0 = CourseKey.from_string('course-v1:edX+toy+2012_Summer')
course1 = CourseKey.from_string('course-v1:edX+toy2+2013_Fall')
role_library_v1 = LibraryUserRole(lib0)
role_course_0 = CourseInstructorRole(course0)
role_course_1 = CourseInstructorRole(course1)
role_library_v1.add_users(self.user)
role_course_0.add_users(self.user)
role_course_1.add_users(self.user)
cache = RoleCache(self.user)
assert cache.has_role('library_user', lib0, 'edX')
assert cache.has_role('instructor', course0, 'edX')
assert cache.has_role('instructor', course1, 'edX')
assert len(cache.all_roles_set) == 3
roles_set = cache.all_roles_set
for role in roles_set:
assert role.course_id.course in ('quizzes', 'toy2', 'toy')
assert roles_set == cache._roles # pylint: disable=protected-access
def test_role_cache_roles_by_course_id(self):
"""
Test that the RoleCache.roles_by_course_id getter method returns a dictionary of roles for a user
that are grouped by course_id or if ungrouped by the ROLE_CACHE_UNGROUPED_ROLES__KEY.
"""
lib0 = LibraryLocator.from_string('library-v1:edX+quizzes')
course0 = CourseKey.from_string('course-v1:edX+toy+2012_Summer')
course1 = CourseKey.from_string('course-v1:edX+toy2+2013_Fall')
role_library_v1 = LibraryUserRole(lib0)
role_course_0 = CourseInstructorRole(course0)
role_course_1 = CourseInstructorRole(course1)
role_org_staff = OrgStaffRole('edX')
role_library_v1.add_users(self.user)
role_course_0.add_users(self.user)
role_course_1.add_users(self.user)
role_org_staff.add_users(self.user)
cache = RoleCache(self.user)
roles_dict = cache.roles_by_course_id
assert len(roles_dict) == 4
assert roles_dict.get(ROLE_CACHE_UNGROUPED_ROLES__KEY).pop().role == 'staff'
assert roles_dict.get('library-v1:edX+quizzes').pop().course_id.course == 'quizzes'
assert roles_dict.get('course-v1:edX+toy+2012_Summer').pop().course_id.course == 'toy'
assert roles_dict.get('course-v1:edX+toy2+2013_Fall').pop().course_id.course == 'toy2'