Transcript upload backend for Video Upload Page – EDUCATOR-1854

This commit is contained in:
Qubad786
2017-12-22 19:18:03 +05:00
committed by Mushtaq Ali
parent c760c6a01b
commit 090e5dc534
7 changed files with 331 additions and 18 deletions

View File

@@ -2,6 +2,7 @@
""" Tests for transcripts_utils. """
import copy
import ddt
import json
import textwrap
import unittest
from uuid import uuid4
@@ -610,28 +611,53 @@ class TestTranscript(unittest.TestCase):
self.txt_transcript = u"Elephant's Dream\nAt the left we can see..."
def test_convert_srt_to_txt(self):
    """
    Verifies that converting the srt transcript produces the expected txt transcript.
    """
    converted = transcripts_utils.Transcript.convert(self.srt_transcript, 'srt', 'txt')
    self.assertEqual(converted, self.txt_transcript)
def test_convert_srt_to_srt(self):
    """
    Verifies that an srt -> srt conversion hands back the srt transcript unchanged.
    """
    converted = transcripts_utils.Transcript.convert(self.srt_transcript, 'srt', 'srt')
    self.assertEqual(converted, self.srt_transcript)
def test_convert_sjson_to_txt(self):
    """
    Verifies that converting the sjson transcript produces the expected txt transcript.
    """
    converted = transcripts_utils.Transcript.convert(self.sjson_transcript, 'sjson', 'txt')
    self.assertEqual(converted, self.txt_transcript)
def test_convert_sjson_to_srt(self):
    """
    Verifies that converting the sjson transcript produces the expected srt transcript.
    """
    converted = transcripts_utils.Transcript.convert(self.sjson_transcript, 'sjson', 'srt')
    self.assertEqual(converted, self.srt_transcript)
def test_convert_srt_to_sjson(self):
    """
    Tests that the srt transcript is successfully converted into sjson format.
    """
    # The sjson fixture is a JSON string, while the srt -> sjson conversion
    # is compared here as a dict, so the fixture is decoded first.
    expected = json.loads(self.sjson_transcript)
    actual = transcripts_utils.Transcript.convert(self.srt_transcript, 'srt', 'sjson')
    self.assertDictEqual(actual, expected)
def test_convert_invalid_srt_to_sjson(self):
    """
    Verifies that TranscriptsGenerationException is raised when trying
    to convert an invalid srt transcript to sjson.
    """
    malformed_srt = 'invalid SubRip file content'
    expected_exception = transcripts_utils.TranscriptsGenerationException
    with self.assertRaises(expected_exception):
        transcripts_utils.Transcript.convert(malformed_srt, 'srt', 'sjson')
def test_dummy_non_existent_transcript(self):
"""

View File

@@ -1,12 +1,15 @@
# -*- coding: utf-8 -*-
import ddt
import json
from mock import Mock, patch
from io import BytesIO
from mock import Mock, patch, ANY
from django.test.testcases import TestCase
from contentstore.tests.utils import CourseTestCase
from contentstore.utils import reverse_course_url
from contentstore.views.transcript_settings import TranscriptionProviderErrorType, validate_transcript_credentials
from openedx.core.djangoapps.profile_images.tests.helpers import make_image_file
@ddt.ddt
@@ -277,3 +280,153 @@ class TranscriptDownloadTest(CourseTestCase):
# Assert the response
self.assertEqual(response.status_code, 400)
self.assertEqual(json.loads(response.content)['error'], expected_error_message)
@ddt.ddt
@patch(
    'openedx.core.djangoapps.video_config.models.VideoTranscriptEnabledFlag.feature_enabled',
    Mock(return_value=True)
)
class TranscriptUploadTest(CourseTestCase):
    """
    Exercises the transcript upload handler.

    The video transcript feature flag is patched to report "enabled" for
    every test in this class; individual tests override it where needed.
    """

    VIEW_NAME = 'transcript_upload_handler'

    def get_url_for_course_key(self, course_id):
        """
        Builds the transcript upload url for the given course id.
        """
        return reverse_course_url(self.VIEW_NAME, course_id)

    def _post_transcript(self, transcript_file):
        """
        Posts `transcript_file` to the upload handler together with a fixed
        video id and language code, and returns the response.
        """
        payload = {
            'edx_video_id': '123',
            'language_code': 'en',
            'file': transcript_file,
        }
        return self.client.post(
            self.get_url_for_course_key(self.course.id),
            payload,
            format='multipart'
        )

    def test_302_with_anonymous_user(self):
        """
        A logged-out client must be redirected instead of being served.
        """
        self.client.logout()
        upload_url = self.get_url_for_course_key(self.course.id)
        response = self.client.post(upload_url, content_type='application/json')
        self.assertEqual(response.status_code, 302)

    def test_405_with_not_allowed_request_method(self):
        """
        Only POST is allowed on the handler; a GET request must yield a 405.
        """
        upload_url = self.get_url_for_course_key(self.course.id)
        response = self.client.get(upload_url, content_type='application/json')
        self.assertEqual(response.status_code, 405)

    def test_404_with_feature_disabled(self):
        """
        With the corresponding feature flag turned off, the handler must answer with a 404.
        """
        upload_url = self.get_url_for_course_key(self.course.id)
        feature_flag = 'openedx.core.djangoapps.video_config.models.VideoTranscriptEnabledFlag.feature_enabled'
        with patch(feature_flag, return_value=False):
            response = self.client.post(upload_url, content_type='application/json')
            self.assertEqual(response.status_code, 404)

    @patch('contentstore.views.transcript_settings.create_or_update_video_transcript')
    def test_transcript_upload_handler(self, mock_create_or_update_video_transcript):
        """
        A valid SubRip upload must be answered with a 201 and be passed on
        for storage as an sjson transcript.
        """
        srt_stream = BytesIO('0\n00:00:00,010 --> 00:00:00,100\nПривіт, edX вітає вас.\n\n')

        # Make request to transcript upload handler
        response = self._post_transcript(srt_stream)

        self.assertEqual(response.status_code, 201)
        # The stored file content is not inspected here, hence `ANY`.
        mock_create_or_update_video_transcript.assert_called_with(
            video_id='123',
            language_code='en',
            file_name='subs.sjson',
            file_format='sjson',
            provider='Custom',
            file_data=ANY,
        )

    @ddt.data(
        (
            {
                'edx_video_id': '123',
                'language_code': 'en',
            },
            u'A transcript file is required.'
        ),
        (
            {
                'language_code': u'en',
                'file': u'0\n00:00:00,010 --> 00:00:00,100\nHi, welcome to Edx.\n\n'
            },
            u'Following parameters are required: edx_video_id.'
        ),
        (
            {
                'file': u'0\n00:00:00,010 --> 00:00:00,100\nHi, welcome to Edx.\n\n'
            },
            u'Following parameters are required: edx_video_id, language_code.'
        )
    )
    @ddt.unpack
    def test_transcript_upload_handler_missing_attrs(self, request_payload, expected_error_message):
        """
        Each payload lacking a required attribute must be rejected with a 400
        and the matching error message.
        """
        upload_url = self.get_url_for_course_key(self.course.id)

        # Make request to transcript upload handler
        response = self.client.post(upload_url, request_payload, format='multipart')

        self.assertEqual(response.status_code, 400)
        reported_error = json.loads(response.content)['error']
        self.assertEqual(reported_error, expected_error_message)

    def test_transcript_upload_handler_with_image(self):
        """
        Uploading an image file in place of a transcript must be rejected with a 400.
        """
        with make_image_file() as image_file:
            # Make request to transcript upload handler
            response = self._post_transcript(image_file)

            self.assertEqual(response.status_code, 400)
            reported_error = json.loads(response.content)['error']
            self.assertEqual(
                reported_error,
                u'There is a problem with this transcript file. Try to upload a different file.'
            )

    def test_transcript_upload_handler_with_invalid_transcript(self):
        """
        Uploading content that is not a valid SubRip transcript must be rejected with a 400.
        """
        broken_srt_stream = BytesIO('An invalid transcript SubRip file content')

        # Make request to transcript upload handler
        response = self._post_transcript(broken_srt_stream)

        self.assertEqual(response.status_code, 400)
        reported_error = json.loads(response.content)['error']
        self.assertEqual(
            reported_error,
            u'There is a problem with this transcript file. Try to upload a different file.'
        )

View File

@@ -2,12 +2,16 @@
Views related to the transcript preferences feature
"""
import os
import json
import logging
from django.contrib.auth.decorators import login_required
from django.core.files.base import ContentFile
from django.http import HttpResponseNotFound, HttpResponse
from django.utils.translation import ugettext as _
from django.views.decorators.http import require_POST, require_GET
from edxval.api import (
create_or_update_video_transcript,
get_3rd_party_transcription_plans,
get_video_transcript_data,
update_transcript_credentials_state_for_org,
@@ -19,9 +23,11 @@ from openedx.core.djangoapps.video_pipeline.api import update_3rd_party_transcri
from util.json_request import JsonResponse, expect_json
from contentstore.views.videos import TranscriptProvider
from xmodule.video_module.transcripts_utils import Transcript
from xmodule.video_module.transcripts_utils import Transcript, TranscriptsGenerationException
__all__ = ['transcript_credentials_handler', 'transcript_download_handler']
__all__ = ['transcript_credentials_handler', 'transcript_download_handler', 'transcript_upload_handler']
LOGGER = logging.getLogger(__name__)
class TranscriptionProviderErrorType:
@@ -146,12 +152,94 @@ def transcript_download_handler(request, course_key_string):
if transcript:
name_and_extension = os.path.splitext(transcript['file_name'])
basename, file_format = name_and_extension[0], name_and_extension[1][1:]
transcript_filename = '{base_name}.srt'.format(base_name=basename.encode('utf8'))
transcript_content = Transcript.convert(transcript['content'], input_format=file_format, output_format='srt')
transcript_filename = '{base_name}.{ext}'.format(base_name=basename.encode('utf8'), ext=Transcript.SRT)
transcript_content = Transcript.convert(
content=transcript['content'],
input_format=file_format,
output_format=Transcript.SRT
)
# Construct an HTTP response
response = HttpResponse(transcript_content, content_type=Transcript.mime_types['srt'])
response = HttpResponse(transcript_content, content_type=Transcript.mime_types[Transcript.SRT])
response['Content-Disposition'] = 'attachment; filename="{filename}"'.format(filename=transcript_filename)
else:
response = HttpResponseNotFound()
return response
def validate_transcript_upload_data(data, files):
    """
    Checks that a transcript upload request carries everything it needs.

    Arguments:
        data: A request's data part.
        files: A request's files part.

    Returns:
        None or String
        The message for the first problem found, or None when both the
        required attributes and the transcript file are present.
    """
    # Required attributes are checked first - this error is unlikely to be faced by common users.
    absent_attrs = [name for name in ('edx_video_id', 'language_code') if name not in data]
    if absent_attrs:
        return _(u'Following parameters are required: {missing}.').format(missing=', '.join(absent_attrs))

    # The transcript file is only looked for once the attributes are complete.
    if 'file' not in files:
        return _(u'A transcript file is required.')

    return None
@login_required
@require_POST
def transcript_upload_handler(request, course_key_string):
    """
    View to upload a transcript file.

    Arguments:
        request: A WSGI request object
        course_key_string: Course key identifying a course

    Transcript file, edx video id and transcript language are required.
    Transcript file should be in SRT(SubRip) format.

    Returns
        - A 400 if any of the validation fails
        - A 404 if the corresponding feature flag is disabled
        - A 201 if transcript has been uploaded successfully
    """
    # Check whether the feature is available for this course.
    course_key = CourseKey.from_string(course_key_string)
    if not VideoTranscriptEnabledFlag.feature_enabled(course_key):
        return HttpResponseNotFound()

    error = validate_transcript_upload_data(data=request.POST, files=request.FILES)
    if error:
        return JsonResponse({'error': error}, status=400)

    edx_video_id = request.POST['edx_video_id']
    language_code = request.POST['language_code']
    transcript_file = request.FILES['file']
    try:
        # Convert SRT transcript into an SJSON format
        # and upload it to S3.
        sjson_subs = Transcript.convert(
            content=transcript_file.read(),
            input_format=Transcript.SRT,
            output_format=Transcript.SJSON
        )
        create_or_update_video_transcript(
            video_id=edx_video_id,
            language_code=language_code,
            file_name='subs.sjson',
            file_format=Transcript.SJSON,
            provider='Custom',
            file_data=ContentFile(json.dumps(sjson_subs)),
        )
    except (TranscriptsGenerationException, UnicodeDecodeError):
        # Both an unparsable SubRip file and content that cannot be decoded
        # are reported to the user as the same "bad file" error.
        return JsonResponse(
            {'error': _(u'There is a problem with this transcript file. Try to upload a different file.')},
            status=400
        )

    return JsonResponse(status=201)

View File

@@ -34,6 +34,7 @@ from edxval.api import (
get_available_transcript_languages
)
from opaque_keys.edx.keys import CourseKey
from xmodule.video_module.transcripts_utils import Transcript
from contentstore.models import VideoUploadConfig
from contentstore.utils import reverse_course_url
@@ -73,9 +74,6 @@ VIDEO_UPLOAD_MAX_FILE_SIZE_GB = 5
# maximum time for video to remain in upload state
MAX_UPLOAD_HOURS = 24
# Transcript download format
TRANSCRIPT_DOWNLOAD_FILE_FORMAT = 'srt'
class TranscriptProvider(object):
"""
@@ -647,8 +645,12 @@ def videos_index_html(course):
'transcript_download_handler',
unicode(course.id)
),
'transcript_upload_handler_url': reverse_course_url(
'transcript_upload_handler',
unicode(course.id)
),
'transcription_plans': get_3rd_party_transcription_plans(),
'trancript_download_file_format': TRANSCRIPT_DOWNLOAD_FILE_FORMAT
'trancript_download_file_format': Transcript.SRT
}
context['active_transcript_preferences'] = get_transcript_preferences(unicode(course.id))
# Cached state for transcript providers' credentials (org-specific)

View File

@@ -22,7 +22,6 @@ define(
model: model,
transcriptAvailableLanguages: options.transcriptAvailableLanguages,
videoSupportedFileFormats: options.videoSupportedFileFormats,
videoTranscriptSettings: options.videoTranscriptSettings,
isVideoTranscriptEnabled: options.isVideoTranscriptEnabled
});
});

View File

@@ -144,6 +144,8 @@ urlpatterns = [
contentstore.views.transcript_credentials_handler, name='transcript_credentials_handler'),
url(r'^transcript_download/{}$'.format(settings.COURSE_KEY_PATTERN),
contentstore.views.transcript_download_handler, name='transcript_download_handler'),
url(r'^transcript_upload/{}$'.format(settings.COURSE_KEY_PATTERN),
contentstore.views.transcript_upload_handler, name='transcript_upload_handler'),
url(r'^video_encodings_download/{}$'.format(settings.COURSE_KEY_PATTERN),
contentstore.views.video_encodings_download, name='video_encodings_download'),
url(r'^group_configurations/{}$'.format(settings.COURSE_KEY_PATTERN),

View File

@@ -9,6 +9,7 @@ import json
import requests
import logging
from pysrt import SubRipTime, SubRipItem, SubRipFile
from pysrt.srtexc import Error
from lxml import etree
from HTMLParser import HTMLParser
@@ -284,6 +285,32 @@ def generate_srt_from_sjson(sjson_subs, speed):
return output
def generate_sjson_from_srt(srt_subs):
    """
    Generate transcripts from SubRip (*.srt) to sjson.

    Arguments:
        srt_subs(SubRip): "SRT" subs object

    Returns:
        Subs converted to "SJSON" format: a dict with `start`, `end` and `text`
        lists holding one entry per subtitle, in the order the subs are iterated.
    """
    sub_starts = []
    sub_ends = []
    sub_texts = []

    # A single pass fills all three lists, so they always stay aligned by index.
    for sub in srt_subs:
        sub_starts.append(sub.start.ordinal)
        sub_ends.append(sub.end.ordinal)
        # Each sjson text entry is a single line; line breaks become spaces.
        sub_texts.append(sub.text.replace('\n', ' '))

    return {
        'start': sub_starts,
        'end': sub_ends,
        'text': sub_texts,
    }
def copy_or_rename_transcript(new_name, old_name, item, delete_old=False, user=None):
"""
Renames `old_name` transcript file in storage to `new_name`.
@@ -544,10 +571,13 @@ class Transcript(object):
"""
Container for transcript methods.
"""
SRT = 'srt'
TXT = 'txt'
SJSON = 'sjson'
mime_types = {
'srt': 'application/x-subrip; charset=utf-8',
'txt': 'text/plain; charset=utf-8',
'sjson': 'application/json',
SRT: 'application/x-subrip; charset=utf-8',
TXT: 'text/plain; charset=utf-8',
SJSON: 'application/json',
}
@staticmethod
@@ -556,7 +586,10 @@ class Transcript(object):
Convert transcript `content` from `input_format` to `output_format`.
Accepted input formats: sjson, srt.
Accepted output format: srt, txt.
Accepted output format: srt, txt, sjson.
Raises:
TranscriptsGenerationException: On parsing the invalid srt content during conversion from srt to sjson.
"""
assert input_format in ('srt', 'sjson')
assert output_format in ('txt', 'srt', 'sjson')
@@ -571,7 +604,17 @@ class Transcript(object):
return HTMLParser().unescape(text)
elif output_format == 'sjson':
raise NotImplementedError
try:
# With error handling (set to 'ERROR_RAISE'), we will be getting
# the exception if something went wrong in parsing the transcript.
srt_subs = SubRipFile.from_string(
content.decode('utf8'),
error_handling=SubRipFile.ERROR_RAISE
)
except Error as ex: # Base exception from pysrt
raise TranscriptsGenerationException(ex.message)
return generate_sjson_from_srt(srt_subs)
if input_format == 'sjson':