From f86ea5cb07b8a90cacbe0b8962cc82c202b7e6c8 Mon Sep 17 00:00:00 2001 From: Muhammad Faraz Maqsood Date: Tue, 17 Jun 2025 17:41:08 +0500 Subject: [PATCH] fix: bson size error due to mongo aggregate func Fix the BSONObj document size error shown below, which is due to the mongo aggregate function, by calculating the count separately and using find instead of aggregate. ``` raise OperationFailure(errmsg, code, response, max_wire_version) pymongo.errors.OperationFailure: BSONObj size: 19117457 (0x123B591) is invalid. Size must be between 0 and 16793600(16MB) First element: _id: null, full error: {'ok': 0.0, 'errmsg': 'BSONObj size: 19117457 (0x123B591) is invalid. Size must be between 0 and 16793600(16MB) First element: _id: null', 'code': 10334, 'codeName': 'BSONObjectTooLarge', '$clusterTime': {'clusterTime': Timestamp(1750073176, 2), 'signature': {'hash': b'\xd7\xdd\xf4\x7f\xe4\xcea\xa1\xa7P\xba\xab\xcb\x12\x7f1A$(\xb4', 'keyId': 7471973240714297345}}, 'operationTime': Timestamp(1750073176, 2)} ``` --- xmodule/contentstore/mongo.py | 84 +++++++++++++---------------------- 1 file changed, 30 insertions(+), 54 deletions(-) diff --git a/xmodule/contentstore/mongo.py b/xmodule/contentstore/mongo.py index e44f03cede..b436474f17 100644 --- a/xmodule/contentstore/mongo.py +++ b/xmodule/contentstore/mongo.py @@ -310,71 +310,47 @@ class MongoContentStore(ContentStore): contentType: The mimetype string of the asset md5: An md5 hash of the asset content ''' - # TODO: Using an aggregate() instead of a find() here is a hack to get around the fact that Mongo 3.2 does not - # support sorting case-insensitively. - # If a sort on displayname is requested, the aggregation pipeline creates a new field: - # `insensitive_displayname`, a lowercase version of `displayname` that is sorted on instead. - # Mongo 3.4 does not require this hack. When upgraded, change this aggregation back to a find and specifiy - # a collation based on user's language locale instead. 
- # See: https://openedx.atlassian.net/browse/EDUCATOR-2221 - pipeline_stages = [] query = query_for_course(course_key, 'asset' if not get_thumbnails else 'thumbnail') if filter_params: query.update(filter_params) - pipeline_stages.append({'$match': query}) + # Count total matching documents + count = self.fs_files.count_documents(query) + + sort_list = [] if sort: sort = dict(sort) if 'displayname' in sort: - pipeline_stages.append({ - '$project': { - 'contentType': 1, - 'locked': 1, - 'chunkSize': 1, - 'content_son': 1, - 'displayname': 1, - 'filename': 1, - 'length': 1, - 'import_path': 1, - 'uploadDate': 1, - 'thumbnail_location': 1, - 'md5': 1, - 'insensitive_displayname': { - '$toLower': '$displayname' - } - } + # Apply case-insensitive sorting + cursor = self.fs_files.find(query, { + 'contentType': 1, + 'locked': 1, + 'chunkSize': 1, + 'content_son': 1, + 'displayname': 1, + 'filename': 1, + 'length': 1, + 'import_path': 1, + 'uploadDate': 1, + 'thumbnail_location': 1, + 'md5': 1 }) - sort = {'insensitive_displayname': sort['displayname']} - pipeline_stages.append({'$sort': sort}) + cursor = cursor.sort('displayname', sort['displayname']).collation({'locale': 'en', 'strength': 2}) + else: + # Apply simple sorting + sort_list = list(sort.items()) + cursor = self.fs_files.find(query).sort(sort_list) + else: + cursor = self.fs_files.find(query) - # This is another hack to get the total query result count, but only the Nth page of actual documents - # See: https://stackoverflow.com/a/39784851/6620612 - pipeline_stages.append({'$group': {'_id': None, 'count': {'$sum': 1}, 'results': {'$push': '$$ROOT'}}}) + # Apply pagination + if start > 0: + cursor = cursor.skip(start) if maxresults > 0: - pipeline_stages.append({ - '$project': { - 'count': 1, - 'results': { - '$slice': ['$results', start, maxresults] - } - } - }) + cursor = cursor.limit(maxresults) - cursor = self.fs_files.aggregate(pipeline_stages) - # Set values if result of query is empty - count = 0 - 
assets = [] - try: - result = cursor.next() - if result: - count = result['count'] - assets = list(result['results']) - except StopIteration: - # Skip if no assets were returned - pass - - # We're constructing the asset key immediately after retrieval from the database so that - # callers are insulated from knowing how our identifiers are stored. + assets = list(cursor) + # Construct asset keys for asset in assets: asset_id = asset.get('content_son', asset['_id']) asset['asset_key'] = course_key.make_asset_key(asset_id['category'], asset_id['name'])