Files
edx-platform/cms/djangoapps/contentstore/transcripts_utils.py
2013-10-22 21:45:03 +03:00

342 lines
11 KiB
Python

"""
Utility functions for transcripts.
++++++++++++++++++++++++++++++++++
"""
import copy
import json
import requests
import logging
from pysrt import SubRipTime, SubRipItem, SubRipFile
from lxml import etree
from cache_toolbox.core import del_cached_content
from django.conf import settings
from xmodule.exceptions import NotFoundError
from xmodule.contentstore.content import StaticContent
from xmodule.contentstore.django import contentstore
from xmodule.modulestore import Location
from xmodule.modulestore.inheritance import own_metadata
from .utils import get_modulestore
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
class TranscriptsGenerationException(Exception):
    """
    Raised when transcripts cannot be generated from a source file
    (unsupported format, or the source file fails to parse).
    """
class GetTranscriptsFromYouTubeException(Exception):
    """
    Raised when transcripts cannot be obtained from the YouTube service.
    """
class TranscriptsRequestValidationException(Exception):
    """
    Error type for invalid transcripts requests.

    NOTE(review): not raised anywhere in this module; presumably used by the
    code that validates incoming transcripts requests -- confirm against callers.
    """
def generate_subs(speed, source_speed, source_subs):
    """
    Rescale subtitle timestamps from `source_speed` to `speed`.

    Args:
    `speed`: float, speed the returned subtitles are generated for,
    `source_speed`: float, speed of `source_subs`,
    `source_subs`: dict with 'start', 'end' and 'text' lists, the existing
        subtitles for speed `source_speed`.

    Returns:
    `subs`: dict of the same shape. When both speeds are equal this is
        `source_subs` itself. Otherwise it is a new dict whose 'start' and
        'end' values are multiplied by `speed / source_speed` and rounded to
        ints, and whose 'text' is the same list object as in `source_subs`.
    """
    if speed == source_speed:
        return source_subs

    # `1.0 *` forces float division even when both speeds are ints.
    ratio = 1.0 * speed / source_speed

    def rescale(timestamps):
        """Multiply every timestamp by `ratio` and round it to an int."""
        return [int(round(stamp * ratio)) for stamp in timestamps]

    return {
        'start': rescale(source_subs['start']),
        'end': rescale(source_subs['end']),
        'text': source_subs['text'],
    }
def save_subs_to_store(subs, subs_id, item):
    """
    Serialize transcripts to JSON and save them as `StaticContent`.

    Args:
    `subs`: JSON-serializable transcripts data,
    `subs_id`: str, subtitles id; the asset is named `subs_<subs_id>.srt.sjson`,
    `item`: video module instance; its location supplies org and course.

    Returns: location of saved subtitles.
    """
    payload = json.dumps(subs, indent=2)
    asset_name = 'subs_{0}.srt.sjson'.format(subs_id)
    location = StaticContent.compute_location(
        item.location.org, item.location.course, asset_name
    )
    asset = StaticContent(location, asset_name, 'application/json', payload)
    contentstore().save(asset)
    # Clear the cache entry for this location after the new content is saved.
    del_cached_content(location)
    return location
def get_transcripts_from_youtube(youtube_id):
    """
    Fetch transcripts for `youtube_id` from the YouTube API configured in
    `settings.YOUTUBE_API`.

    Parses only utf-8 encoded transcripts.
    Other encodings are not supported at the moment.

    Returns: dict with 'start', 'end' and 'text' lists. 'start' and 'end' hold
    int millisecond timestamps; newlines in 'text' entries are replaced by
    spaces. Elements without text are skipped.

    Raises GetTranscriptsFromYouTubeException if the response status code is
    not 200 or the response body is empty.
    """
    # Work on a copy so the settings dict itself is never modified.
    api = copy.deepcopy(settings.YOUTUBE_API)
    api['params']['v'] = youtube_id
    response = requests.get(api['url'], params=api['params'])

    if not (response.status_code == 200 and response.text):
        msg = "Can't receive transcripts from Youtube for {}. Status code: {}.".format(
            youtube_id, response.status_code)
        raise GetTranscriptsFromYouTubeException(msg)

    parser = etree.XMLParser(encoding='utf-8')
    xml_root = etree.fromstring(response.content, parser=parser)

    starts, ends, texts = [], [], []
    for node in xml_root:
        if node.tag != "text":
            continue

        begin = float(node.get("start"))
        finish = begin + float(node.get("dur", 0))  # dur is not mandatory
        caption = node.text
        if not caption:
            continue

        # Start and end should be ints representing the millisecond timestamp.
        starts.append(int(begin * 1000))
        # NOTE(review): the 0.0001 offset presumably compensates for float
        # truncation in int(); it is applied to the end timestamp only.
        ends.append(int((finish + 0.0001) * 1000))
        texts.append(caption.replace('\n', ' '))

    return {'start': starts, 'end': ends, 'text': texts}
def download_youtube_subs(youtube_subs, item):
    """
    Download transcripts from Youtube and save them to assets.

    Speeds with an empty youtube id are skipped entirely. For every other
    speed the transcripts are downloaded and saved. Speeds whose download fails
    or yields no captions are generated afterwards from the transcripts of the
    highest speed that was downloaded successfully.

    Args:
    youtube_subs: dictionary of `speed: youtube_id` key:value pairs.
    item: video module instance.

    Returns: None, if transcripts were successfully downloaded and saved.
    Otherwise raises GetTranscriptsFromYouTubeException (no speed could be
    downloaded).
    """
    highest_speed = highest_speed_subs = None
    missed_speeds = []

    # Iterate from lowest to highest speed and try to do download transcripts
    # from the Youtube service.
    for speed, youtube_id in sorted(youtube_subs.iteritems()):
        if not youtube_id:
            continue

        try:
            subs = get_transcripts_from_youtube(youtube_id)
            # The returned dict always has its three keys and is therefore
            # truthy even when no captions were parsed, so the caption list
            # itself has to be checked to detect empty transcripts.
            if not subs or not subs.get('text'):
                raise GetTranscriptsFromYouTubeException
        except GetTranscriptsFromYouTubeException:
            missed_speeds.append(speed)
            continue

        save_subs_to_store(subs, youtube_id, item)
        log.info(
            "Transcripts for YouTube id %s (speed %s) "
            "are downloaded and saved.", youtube_id, speed
        )
        highest_speed = speed
        highest_speed_subs = subs

    if not highest_speed:
        raise GetTranscriptsFromYouTubeException("Can't find any transcripts on the Youtube service.")

    # When we exit from the previous loop, `highest_speed` and `highest_speed_subs`
    # are the transcripts data for the highest speed available on the
    # Youtube service. We use the highest speed as main speed for the
    # generation other transcripts, cause during calculation timestamps
    # for lower speeds we just use multiplication instead of division.
    for speed in missed_speeds:  # Generate transcripts for missed speeds.
        save_subs_to_store(
            generate_subs(speed, highest_speed, highest_speed_subs),
            youtube_subs[speed],
            item
        )
        log.info(
            "Transcripts for YouTube id %s (speed %s) "
            "are generated from YouTube id %s (speed %s) and saved",
            youtube_subs[speed], speed,
            youtube_subs[highest_speed],
            highest_speed
        )
def remove_subs_from_store(subs_id, item):
    """
    Delete the `subs_<subs_id>.srt.sjson` asset of `item`'s course from the
    content store, if it exists.

    A `NotFoundError` raised while looking up or deleting the asset is
    ignored, so removing non-existent transcripts is a no-op.
    """
    asset_name = 'subs_{0}.srt.sjson'.format(subs_id)
    location = StaticContent.compute_location(
        item.location.org, item.location.course, asset_name
    )
    try:
        stored = contentstore().find(location)
        contentstore().delete(stored.get_id())
        log.info("Removed subs %s from store", subs_id)
    except NotFoundError:
        # Nothing stored under this name: nothing to remove.
        pass
def generate_subs_from_source(speed_subs, subs_type, subs_filedata, item):
    """Generate transcripts from source files (like SubRip format, etc.)
    and save them to assets for `item` module.

    The source subs are treated as having speed 1; for every entry of
    `speed_subs` they are rescaled to that speed and saved under its id.

    :param speed_subs: dictionary {speed: sub_id, ...}
    :param subs_type: type of source subs; only "srt" is supported.
    :param subs_filedata: unicode, content of source subs.
    :param item: module object.
    :returns: dict with 'start', 'end' and 'text' lists parsed from the
        source file (i.e. the speed 1 subs).
    :raises TranscriptsGenerationException: if `subs_type` is not "srt", or
        the source cannot be parsed, or it parses to an empty result.
    """
    if subs_type != 'srt':
        raise TranscriptsGenerationException("We support only SubRip (*.srt) transcripts format.")

    try:
        parsed = SubRipFile.from_string(subs_filedata)
    except Exception as error:
        raise TranscriptsGenerationException(
            "Something wrong with SubRip transcripts file during parsing. "
            "Inner message is {}".format(error.message)
        )
    if not parsed:
        raise TranscriptsGenerationException("Something wrong with SubRip transcripts file during parsing.")

    subs = {
        'start': [entry.start.ordinal for entry in parsed],
        'end': [entry.end.ordinal for entry in parsed],
        'text': [entry.text.replace('\n', ' ') for entry in parsed],
    }

    for speed, subs_id in speed_subs.iteritems():
        rescaled = generate_subs(speed, 1, subs)
        save_subs_to_store(rescaled, subs_id, item)

    return subs
def generate_srt_from_sjson(sjson_subs, speed):
    """Convert "sjson" transcripts to SubRip (*.srt) text.

    The timestamps are first passed through
    `generate_subs(speed, 1, sjson_subs)`, then every entry is rendered as a
    `SubRipItem` (indexed from 0) followed by a newline.

    :param sjson_subs: "sjson" subs: dict with 'start', 'end' and 'text' lists.
    :param speed: speed of `sjson_subs`.
    :returns: "srt" subs as a string; an empty string if the three lists of
        `sjson_subs` differ in length.
    """
    if not (len(sjson_subs['start']) == len(sjson_subs['end']) == len(sjson_subs['text'])):
        return ''

    rescaled = generate_subs(speed, 1, sjson_subs)
    rows = zip(rescaled['start'], rescaled['end'], rescaled['text'])

    pieces = []
    for position, (begin, finish, caption) in enumerate(rows):
        entry = SubRipItem(
            index=position,
            start=SubRipTime(milliseconds=begin),
            end=SubRipTime(milliseconds=finish),
            text=caption
        )
        pieces.append(unicode(entry))
        pieces.append('\n')
    return ''.join(pieces)
def save_module(item):
    """
    Save `item`, then write its own metadata to the modulestore that
    serves the item's location.
    """
    item.save()
    get_modulestore(Location(item.id)).update_metadata(
        item.id, own_metadata(item)
    )
def copy_or_rename_transcript(new_name, old_name, item, delete_old=False):
    """
    Copy the `old_name` transcript file in storage to `new_name`, point
    `item.sub` at `new_name` and save the module.

    If `old_name` is not found in storage, raises `NotFoundError`.
    If `delete_old` is True, the `old_name` file is removed from storage
    afterwards, which turns the copy into a rename.
    """
    source_name = 'subs_{0}.srt.sjson'.format(old_name)
    source_location = StaticContent.compute_location(
        item.location.org, item.location.course, source_name
    )
    stored_data = contentstore().find(source_location).data

    save_subs_to_store(json.loads(stored_data), new_name, item)
    item.sub = new_name
    save_module(item)

    if not delete_old:
        return
    remove_subs_from_store(old_name, item)
def manage_video_subtitles_save(old_item, new_item):
    """
    Does some specific things, that can be done only on save.

    Video player item has some video fields: HTML5 ones and Youtube one.

    1. For every non-empty video id of `new_item` (the Youtube 1.0 id first,
    then the ids derived from the HTML5 sources), the transcripts file named
    by the original `new_item.sub` is copied under that video id. Every
    successful copy also sets `new_item.sub` to that video id, so the field
    ends up holding the last id that was copied successfully.
    If the `new_item.sub` file is not present, nothing is copied and a debug
    message is logged for each video id.

    This ensures that after user changes video fields, proper `sub` files,
    corresponding to new values of video fields, are present in the system.

    old_item is not used here, but is added for future changes.
    """
    # 1.
    # assume '.' and '/' are not in filenames
    html5_ids = [
        source.split('/')[-1].split('.')[0]
        for source in new_item.html5_sources
    ]
    candidate_ids = [new_item.youtube_id_1_0] + html5_ids

    # Remember the original value: copy_or_rename_transcript changes item.sub
    # of module on every successful copy.
    source_sub = new_item.sub

    for video_id in (candidate for candidate in candidate_ids if candidate):
        try:
            # updates item.sub with `video_id`, if it is successful.
            copy_or_rename_transcript(video_id, source_sub, new_item)
        except NotFoundError:
            # subtitles file `source_sub` is not presented in the system.
            # Nothing to copy or rename.
            log.debug(
                "Copying %s file content to %s name is failed, "
                "original file does not exist.",
                source_sub, video_id
            )