Files
edx-platform/cms/djangoapps/contentstore/views/transcript_settings.py
Régis Behmo 97399cf97c Fix TypeError during transcript upload to S3
On a platform that is configured to upload video transcripts to S3
(`DEFAULT_FILE_STORAGE = "storages.backends.s3boto3.S3Boto3Storage"`),
uploads from the studio fail with a TypeError: "Unicode-objects must be
encoded before hashing"

A full stacktrace of the issue can be found here:
https://sentry.overhang.io/share/issue/2249b6f67d794c7e986cc288758f4ebe/

This error is triggered by MD5 hashing in the botocore library, which
is itself used by the S3Boto3Storage storage class. The error does not
occur with filesystem-based uploads because the filesystem storage
backend does not perform checksum verification. The reason why this
error does not occur on edx.org is unknown. Similar issues were already
fixed in edxval.

To address this issue, we encode the transcript file content prior to
sending it to S3.
2021-01-25 12:23:31 +01:00

282 lines
10 KiB
Python

"""
Views related to the transcript preferences feature
"""
import logging
import os
from django.contrib.auth.decorators import login_required
from django.core.files.base import ContentFile
from django.http import HttpResponse, HttpResponseNotFound
from django.utils.translation import ugettext as _
from django.views.decorators.http import require_GET, require_http_methods, require_POST
from edxval.api import (
create_or_update_video_transcript,
delete_video_transcript,
get_3rd_party_transcription_plans,
get_available_transcript_languages,
get_video_transcript_data,
update_transcript_credentials_state_for_org
)
from opaque_keys.edx.keys import CourseKey
from openedx.core.djangoapps.video_config.models import VideoTranscriptEnabledFlag
from openedx.core.djangoapps.video_pipeline.api import update_3rd_party_transcription_service_credentials
from common.djangoapps.student.auth import has_studio_write_access
from common.djangoapps.util.json_request import JsonResponse, expect_json
from xmodule.video_module.transcripts_utils import Transcript, TranscriptsGenerationException
from .videos import TranscriptProvider
# Names exported by a wildcard import of this module: the four view handlers.
__all__ = [
    'transcript_credentials_handler',
    'transcript_download_handler',
    'transcript_upload_handler',
    'transcript_delete_handler'
]
# Module-level logger, named after this module.
LOGGER = logging.getLogger(__name__)
class TranscriptionProviderErrorType:
    """
    Transcription provider's error types enumeration.

    The values are compared against the 'error_type' entry of the error
    response returned when updating a provider's credentials fails.
    """
    # Error type signalling that the supplied provider credentials were rejected.
    INVALID_CREDENTIALS = 1
def validate_transcript_credentials(provider, **credentials):
    """
    Check the credentials supplied for a 3rd party transcription provider.

    The provider must be one of the keys returned by
    `get_3rd_party_transcription_plans()`. The required credentials are:
        3PlayMedia - 'api_key' and 'api_secret_key'.
        Cielo24 - 'api_key' and 'username'.

    Arguments:
        provider: Identifier of the transcription provider.
        credentials: Credentials passed as keyword arguments; anything that
            is not required for the provider is ignored.

    Returns:
        A tuple `(error_message, validated_credentials)`. On success the
        error message is an empty string and the dict holds only the required
        credentials; on failure the message is set and the dict is empty.
    """
    # A list (not the dict itself) is used for the membership test, exactly as
    # before, so the comparison is done by equality rather than by hashing.
    supported_providers = list(get_3rd_party_transcription_plans().keys())
    if provider not in supported_providers:
        return u'Invalid Provider {provider}.'.format(provider=provider), {}

    if provider == TranscriptProvider.THREE_PLAY_MEDIA:
        required_keys = ['api_key', 'api_secret_key']
    elif provider == TranscriptProvider.CIELO24:
        required_keys = ['api_key', 'username']
    else:
        # A supported provider without specific requirements validates trivially.
        required_keys = []

    absent_keys = [key for key in required_keys if key not in credentials]
    if absent_keys:
        return u'{missing} must be specified.'.format(missing=' and '.join(absent_keys)), {}

    # Keep only the credentials this provider actually needs.
    return '', {key: credentials[key] for key in required_keys}
@expect_json
@login_required
@require_POST
def transcript_credentials_handler(request, course_key_string):
    """
    JSON view handler to update the transcript organization credentials.

    Arguments:
        request: WSGI request object
        course_key_string: A course identifier to extract the org.

    Returns:
        - A 200 response if credentials are valid and successfully updated in edx-video-pipeline.
        - A 404 response if transcript feature is not enabled for this course.
        - A 400 if credentials do not pass validations (this includes a request
          without a 'provider'), hence not updated in edx-video-pipeline.
    """
    # NOTE(review): this view only requires a logged-in user, whereas
    # transcript_delete_handler also checks has_studio_write_access - confirm
    # that the missing course-level permission check is intentional.
    course_key = CourseKey.from_string(course_key_string)
    if not VideoTranscriptEnabledFlag.feature_enabled(course_key):
        return HttpResponseNotFound()

    # Default to None so that a payload without 'provider' is rejected by the
    # validation below with a 400, instead of raising a KeyError here.
    provider = request.json.pop('provider', None)
    error_message, validated_credentials = validate_transcript_credentials(provider=provider, **request.json)
    if error_message:
        response = JsonResponse({'error': error_message}, status=400)
    else:
        # Send the validated credentials to edx-video-pipeline and video-encode-manager
        credentials_payload = dict(validated_credentials, org=course_key.org, provider=provider)
        error_response, is_updated = update_3rd_party_transcription_service_credentials(**credentials_payload)
        # Send appropriate response based on whether credentials were updated or not.
        if is_updated:
            # Cache credentials state in edx-val.
            update_transcript_credentials_state_for_org(org=course_key.org, provider=provider, exists=is_updated)
            response = JsonResponse(status=200)
        else:
            # Error response would contain error types and the following
            # error type is received from edx-video-pipeline whenever we've
            # got invalid credentials for a provider. Its kept this way because
            # edx-video-pipeline doesn't support i18n translations yet.
            error_type = error_response.get('error_type')
            if error_type == TranscriptionProviderErrorType.INVALID_CREDENTIALS:
                error_message = _('The information you entered is incorrect.')

            response = JsonResponse({'error': error_message}, status=400)

    return response
@login_required
@require_GET
def transcript_download_handler(request):
    """
    JSON view handler serving a video transcript as an SRT attachment.

    Arguments:
        request: WSGI request object; 'edx_video_id' and 'language_code' are
            read from its query string.

    Returns:
        - A 200 response with SRT transcript file attached.
        - A 400 if a required query parameter is missing.
        - A 404 if there is no such transcript.
    """
    required_params = ['edx_video_id', 'language_code']
    absent_params = [param for param in required_params if param not in request.GET]
    if absent_params:
        error_text = _(u'The following parameters are required: {missing}.').format(
            missing=', '.join(absent_params)
        )
        return JsonResponse({'error': error_text}, status=400)

    transcript = get_video_transcript_data(
        video_id=request.GET['edx_video_id'],
        language_code=request.GET['language_code']
    )
    if not transcript:
        return HttpResponseNotFound()

    # The stored file's extension (without the leading dot) names its format.
    basename, extension = os.path.splitext(transcript['file_name'])
    srt_filename = '{base_name}.{ext}'.format(base_name=basename, ext=Transcript.SRT)
    srt_content = Transcript.convert(
        content=transcript['content'],
        input_format=extension[1:],
        output_format=Transcript.SRT
    )

    # Serve the converted transcript as a downloadable attachment.
    response = HttpResponse(srt_content, content_type=Transcript.mime_types[Transcript.SRT])
    response['Content-Disposition'] = u'attachment; filename="{filename}"'.format(filename=srt_filename)
    return response
def validate_transcript_upload_data(data, files):
    """
    Validates the data submitted with a video transcript upload.

    Checks, in order, that the required parameters are present, that changing
    the language does not collide with an existing transcript of the video,
    and that a file was supplied.

    Arguments:
        data: A request's data part.
        files: A request's files part.

    Returns:
        None or String
        If there is error returns error message otherwise None.
    """
    error = None
    # Validate the must have attributes - this error is unlikely to be faced by common users.
    must_have_attrs = ['edx_video_id', 'language_code', 'new_language_code']
    missing = [attr for attr in must_have_attrs if attr not in data]
    if missing:
        error = _(u'The following parameters are required: {missing}.').format(missing=', '.join(missing))
    elif (
        data['language_code'] != data['new_language_code'] and
        data['new_language_code'] in get_available_transcript_languages(video_id=data['edx_video_id'])
    ):
        # Translate the template first and interpolate afterwards: formatting
        # before the lookup would make the message never match its msgid.
        error = _(u'A transcript with the "{language_code}" language code already exists.').format(
            language_code=data['new_language_code']
        )
    elif 'file' not in files:
        error = _(u'A transcript file is required.')

    return error
@login_required
@require_POST
def transcript_upload_handler(request):
    """
    View receiving an uploaded transcript file for a video.

    Arguments:
        request: A WSGI request object

        The transcript file, the edx video id and the transcript language
        codes are required. The file is read as UTF-8 encoded SRT (SubRip).

    Returns
        - A 400 if any of the validation fails or the file cannot be converted
        - A 201 if transcript has been uploaded successfully
    """
    validation_error = validate_transcript_upload_data(data=request.POST, files=request.FILES)
    if validation_error:
        return JsonResponse({'error': validation_error}, status=400)

    edx_video_id = request.POST['edx_video_id']
    language_code = request.POST['language_code']
    new_language_code = request.POST['new_language_code']
    uploaded_file = request.FILES['file']
    try:
        # Convert the SRT transcript into SJSON; the result is encoded to
        # bytes before being wrapped in a ContentFile for storage.
        srt_text = uploaded_file.read().decode('utf-8')
        sjson_text = Transcript.convert(
            content=srt_text,
            input_format=Transcript.SRT,
            output_format=Transcript.SJSON
        )
        sjson_bytes = sjson_text.encode()
        transcript_metadata = {
            'provider': TranscriptProvider.CUSTOM,
            'file_format': Transcript.SJSON,
            'language_code': new_language_code
        }
        create_or_update_video_transcript(
            video_id=edx_video_id,
            language_code=language_code,
            metadata=transcript_metadata,
            file_data=ContentFile(sjson_bytes),
        )
        return JsonResponse(status=201)
    except (TranscriptsGenerationException, UnicodeDecodeError):
        return JsonResponse(
            {'error': _(u'There is a problem with this transcript file. Try to upload a different file.')},
            status=400
        )
@login_required
@require_http_methods(["DELETE"])
def transcript_delete_handler(request, course_key_string, edx_video_id, language_code):
    """
    View removing a transcript from a video.

    Arguments:
        request: A WSGI request object
        course_key_string: Course key identifying a course.
        edx_video_id: edX video identifier whose transcript needs to be deleted.
        language_code: transcript's language code.

    Returns
        - A 404 if the user does not have the required permissions
        - A 200 if transcript is deleted without any error(s)
    """
    course_key = CourseKey.from_string(course_key_string)
    # Only users with studio write access to the course may delete transcripts.
    can_edit_course = has_studio_write_access(request.user, course_key)
    if can_edit_course:
        delete_video_transcript(video_id=edx_video_id, language_code=language_code)
        return JsonResponse(status=200)

    # Users without write access get a 404.
    return HttpResponseNotFound()