fix: bson size error due to mongo aggregate func

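Fix the BSONObj document-size error shown below. The aggregate() pipeline used a `$group` stage that pushed every matching asset (`$$ROOT`) into a single result document, which exceeded MongoDB's 16MB BSON document limit for courses with many assets. The total is now computed with count_documents(), and the requested page of assets is fetched with find() (plus sort/skip/limit) instead of aggregate().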
```
raise OperationFailure(errmsg, code, response, max_wire_version)
pymongo.errors.OperationFailure: BSONObj size: 19117457 (0x123B591) is invalid. Size must be between 0 and 16793600(16MB) First element: _id: null, full error: {'ok': 0.0, 'errmsg': 'BSONObj size: 19117457 (0x123B591) is invalid. Size must be between 0 and 16793600(16MB) First element: _id: null', 'code': 10334, 'codeName': 'BSONObjectTooLarge', '$clusterTime': {'clusterTime': Timestamp(1750073176, 2), 'signature': {'hash': b'\xd7\xdd\xf4\x7f\xe4\xcea\xa1\xa7P\xba\xab\xcb\x12\x7f1A$(\xb4', 'keyId': 7471973240714297345}}, 'operationTime': Timestamp(1750073176, 2)}
```
This commit is contained in:
Muhammad Faraz Maqsood
2025-06-17 17:41:08 +05:00
committed by Muhammad Faraz Maqsood
parent 9d6d8088b8
commit f86ea5cb07

View File

@@ -310,71 +310,47 @@ class MongoContentStore(ContentStore):
contentType: The mimetype string of the asset
md5: An md5 hash of the asset content
'''
# TODO: Using an aggregate() instead of a find() here is a hack to get around the fact that Mongo 3.2 does not
# support sorting case-insensitively.
# If a sort on displayname is requested, the aggregation pipeline creates a new field:
# `insensitive_displayname`, a lowercase version of `displayname` that is sorted on instead.
# Mongo 3.4 does not require this hack. When upgraded, change this aggregation back to a find and specify
# a collation based on user's language locale instead.
# See: https://openedx.atlassian.net/browse/EDUCATOR-2221
pipeline_stages = []
query = query_for_course(course_key, 'asset' if not get_thumbnails else 'thumbnail')
if filter_params:
query.update(filter_params)
pipeline_stages.append({'$match': query})
# Count total matching documents
count = self.fs_files.count_documents(query)
sort_list = []
if sort:
sort = dict(sort)
if 'displayname' in sort:
pipeline_stages.append({
'$project': {
'contentType': 1,
'locked': 1,
'chunkSize': 1,
'content_son': 1,
'displayname': 1,
'filename': 1,
'length': 1,
'import_path': 1,
'uploadDate': 1,
'thumbnail_location': 1,
'md5': 1,
'insensitive_displayname': {
'$toLower': '$displayname'
}
}
# Apply case-insensitive sorting
cursor = self.fs_files.find(query, {
'contentType': 1,
'locked': 1,
'chunkSize': 1,
'content_son': 1,
'displayname': 1,
'filename': 1,
'length': 1,
'import_path': 1,
'uploadDate': 1,
'thumbnail_location': 1,
'md5': 1
})
sort = {'insensitive_displayname': sort['displayname']}
pipeline_stages.append({'$sort': sort})
cursor = cursor.sort('displayname', sort['displayname']).collation({'locale': 'en', 'strength': 2})
else:
# Apply simple sorting
sort_list = list(sort.items())
cursor = self.fs_files.find(query).sort(sort_list)
else:
cursor = self.fs_files.find(query)
# This is another hack to get the total query result count, but only the Nth page of actual documents
# See: https://stackoverflow.com/a/39784851/6620612
pipeline_stages.append({'$group': {'_id': None, 'count': {'$sum': 1}, 'results': {'$push': '$$ROOT'}}})
# Apply pagination
if start > 0:
cursor = cursor.skip(start)
if maxresults > 0:
pipeline_stages.append({
'$project': {
'count': 1,
'results': {
'$slice': ['$results', start, maxresults]
}
}
})
cursor = cursor.limit(maxresults)
cursor = self.fs_files.aggregate(pipeline_stages)
# Set values if result of query is empty
count = 0
assets = []
try:
result = cursor.next()
if result:
count = result['count']
assets = list(result['results'])
except StopIteration:
# Skip if no assets were returned
pass
# We're constructing the asset key immediately after retrieval from the database so that
# callers are insulated from knowing how our identifiers are stored.
assets = list(cursor)
# Construct asset keys
for asset in assets:
asset_id = asset.get('content_son', asset['_id'])
asset['asset_key'] = course_key.make_asset_key(asset_id['category'], asset_id['name'])