""" Utility functions for transcripts. ++++++++++++++++++++++++++++++++++ """ import copy import json import requests import logging from pysrt import SubRipTime, SubRipItem, SubRipFile from lxml import etree from cache_toolbox.core import del_cached_content from django.conf import settings from xmodule.exceptions import NotFoundError from xmodule.contentstore.content import StaticContent from xmodule.contentstore.django import contentstore from xmodule.modulestore import Location from xmodule.modulestore.inheritance import own_metadata from .utils import get_modulestore log = logging.getLogger(__name__) class TranscriptsGenerationException(Exception): pass class GetTranscriptsFromYouTubeException(Exception): pass class TranscriptsRequestValidationException(Exception): pass def generate_subs(speed, source_speed, source_subs): """ Generate transcripts from one speed to another speed. Args: `speed`: float, for this speed subtitles will be generated, `source_speed`: float, speed of source_subs `soource_subs`: dict, existing subtitles for speed `source_speed`. Returns: `subs`: dict, actual subtitles. """ if speed == source_speed: return source_subs coefficient = 1.0 * speed / source_speed subs = { 'start': [ int(round(timestamp * coefficient)) for timestamp in source_subs['start'] ], 'end': [ int(round(timestamp * coefficient)) for timestamp in source_subs['end'] ], 'text': source_subs['text']} return subs def save_subs_to_store(subs, subs_id, item): """ Save transcripts into `StaticContent`. Args: `subs_id`: str, subtitles id `item`: video module instance Returns: location of saved subtitles. """ filedata = json.dumps(subs, indent=2) mime_type = 'application/json' filename = 'subs_{0}.srt.sjson'.format(subs_id) content_location = StaticContent.compute_location( item.location.org, item.location.course, filename ) content = StaticContent(content_location, filename, mime_type, filedata) contentstore().save(content) del_cached_content(content_location) return content_location def get_transcripts_from_youtube(youtube_id): """ Gets transcripts from youtube for youtube_id. Parses only utf-8 encoded transcripts. Other encodings are not supported at the moment. Returns (status, transcripts): bool, dict. """ utf8_parser = etree.XMLParser(encoding='utf-8') youtube_api = copy.deepcopy(settings.YOUTUBE_API) youtube_api['params']['v'] = youtube_id data = requests.get(youtube_api['url'], params=youtube_api['params']) if data.status_code != 200 or not data.text: msg = "Can't receive transcripts from Youtube for {}. Status code: {}.".format( youtube_id, data.status_code) raise GetTranscriptsFromYouTubeException(msg) sub_starts, sub_ends, sub_texts = [], [], [] xmltree = etree.fromstring(data.content, parser=utf8_parser) for element in xmltree: if element.tag == "text": start = float(element.get("start")) duration = float(element.get("dur", 0)) # dur is not mandatory text = element.text end = start + duration if text: # Start and end should be ints representing the millisecond timestamp. sub_starts.append(int(start * 1000)) sub_ends.append(int((end + 0.0001) * 1000)) sub_texts.append(text.replace('\n', ' ')) return {'start': sub_starts, 'end': sub_ends, 'text': sub_texts} def download_youtube_subs(youtube_subs, item): """ Download transcripts from Youtube and save them to assets. Args: youtube_subs: dictionary of `speed: youtube_id` key:value pairs. item: video module instance. Returns: None, if transcripts were successfully downloaded and saved. Otherwise raises GetTranscriptsFromYouTubeException. """ highest_speed = highest_speed_subs = None missed_speeds = [] # Iterate from lowest to highest speed and try to do download transcripts # from the Youtube service. for speed, youtube_id in sorted(youtube_subs.iteritems()): if not youtube_id: continue try: subs = get_transcripts_from_youtube(youtube_id) if not subs: # if empty subs are returned raise GetTranscriptsFromYouTubeException except GetTranscriptsFromYouTubeException: missed_speeds.append(speed) continue save_subs_to_store(subs, youtube_id, item) log.info( "Transcripts for YouTube id %s (speed %s)" "are downloaded and saved.", youtube_id, speed ) highest_speed = speed highest_speed_subs = subs if not highest_speed: raise GetTranscriptsFromYouTubeException("Can't find any transcripts on the Youtube service.") # When we exit from the previous loop, `highest_speed` and `highest_speed_subs` # are the transcripts data for the highest speed available on the # Youtube service. We use the highest speed as main speed for the # generation other transcripts, cause during calculation timestamps # for lower speeds we just use multiplication instead of division. for speed in missed_speeds: # Generate transcripts for missed speeds. save_subs_to_store( generate_subs(speed, highest_speed, highest_speed_subs), youtube_subs[speed], item ) log.info( "Transcripts for YouTube id %s (speed %s)" "are generated from YouTube id %s (speed %s) and saved", youtube_subs[speed], speed, youtube_subs[highest_speed], highest_speed ) def remove_subs_from_store(subs_id, item): """ Remove from store, if transcripts content exists. """ filename = 'subs_{0}.srt.sjson'.format(subs_id) content_location = StaticContent.compute_location( item.location.org, item.location.course, filename ) try: content = contentstore().find(content_location) contentstore().delete(content.get_id()) log.info("Removed subs %s from store", subs_id) except NotFoundError: pass def generate_subs_from_source(speed_subs, subs_type, subs_filedata, item): """Generate transcripts from source files (like SubRip format, etc.) and save them to assets for `item` module. We expect, that speed of source subs equal to 1 :param speed_subs: dictionary {speed: sub_id, ...} :param subs_type: type of source subs: "srt", ... :param subs_filedata:unicode, content of source subs. :param item: module object. :returns: True, if all subs are generated and saved successfully. """ if subs_type != 'srt': raise TranscriptsGenerationException("We support only SubRip (*.srt) transcripts format.") try: srt_subs_obj = SubRipFile.from_string(subs_filedata) except Exception as e: raise TranscriptsGenerationException( "Something wrong with SubRip transcripts file during parsing. " "Inner message is {}".format(e.message) ) if not srt_subs_obj: raise TranscriptsGenerationException("Something wrong with SubRip transcripts file during parsing.") sub_starts = [] sub_ends = [] sub_texts = [] for sub in srt_subs_obj: sub_starts.append(sub.start.ordinal) sub_ends.append(sub.end.ordinal) sub_texts.append(sub.text.replace('\n', ' ')) subs = { 'start': sub_starts, 'end': sub_ends, 'text': sub_texts} for speed, subs_id in speed_subs.iteritems(): save_subs_to_store( generate_subs(speed, 1, subs), subs_id, item ) return subs def generate_srt_from_sjson(sjson_subs, speed): """Generate transcripts with speed = 1.0 from sjson to SubRip (*.srt). :param sjson_subs: "sjson" subs. :param speed: speed of `sjson_subs`. :returns: "srt" subs. """ output = '' equal_len = len(sjson_subs['start']) == len(sjson_subs['end']) == len(sjson_subs['text']) if not equal_len: return output sjson_speed_1 = generate_subs(speed, 1, sjson_subs) for i in range(len(sjson_speed_1['start'])): item = SubRipItem( index=i, start=SubRipTime(milliseconds=sjson_speed_1['start'][i]), end=SubRipTime(milliseconds=sjson_speed_1['end'][i]), text=sjson_speed_1['text'][i] ) output += (unicode(item)) output += '\n' return output def save_module(item): """ Proceed with additional save operations. """ item.save() store = get_modulestore(Location(item.id)) store.update_metadata(item.id, own_metadata(item)) def copy_or_rename_transcript(new_name, old_name, item, delete_old=False): """ Renames `old_name` transcript file in storage to `new_name`. If `old_name` is not found in storage, raises `NotFoundError`. If `delete_old` is True, removes `old_name` files from storage. """ filename = 'subs_{0}.srt.sjson'.format(old_name) content_location = StaticContent.compute_location( item.location.org, item.location.course, filename ) transcripts = contentstore().find(content_location).data save_subs_to_store(json.loads(transcripts), new_name, item) item.sub = new_name save_module(item) if delete_old: remove_subs_from_store(old_name, item) def manage_video_subtitles_save(old_item, new_item): """ Does some specific things, that can be done only on save. Video player item has some video fields: HTML5 ones and Youtube one. 1. If value of `sub` field of `new_item` is different from values of video fields of `new_item`, and `new_item.sub` file is present, then code in this function creates copies of `new_item.sub` file with new names. That names are equal to values of video fields of `new_item` After that `sub` field of `new_item` is changed to one of values of video fields. This whole action ensures that after user changes video fields, proper `sub` files, corresponding to new values of video fields, will be presented in system. old_item is not used here, but is added for future changes. """ # 1. # assume '.' and '/' are not in filenames html5_ids = [x.split('/')[-1].split('.')[0] for x in new_item.html5_sources] possible_video_id_list = [new_item.youtube_id_1_0] + html5_ids sub_name = new_item.sub for video_id in possible_video_id_list: if not video_id: continue # copy_or_rename_transcript changes item.sub of module try: # updates item.sub with `video_id`, if it is successful. copy_or_rename_transcript(video_id, sub_name, new_item) except NotFoundError: # subtitles file `sub_name` is not presented in the system. Nothing to copy or rename. log.debug( "Copying %s file content to %s name is failed, " "original file does not exist.", sub_name, video_id )