# common/djangoapps/search/index.py
#
# Indexes edX sjson video transcripts into an ElasticSearch
# "transcript-index", storing raw text, a NYSIIS phonetic rendering, and
# the video's youtube uuid, and exposes fuzzy and phonetic search helpers.
#
# NOTE(review): this module targets Python 2 (byte/unicode handling in
# clean_transcript/phoneticize is Py2-specific).
import os
import os.path as pt
import json
import re
import string

# Explicit imports instead of the original `from pyes import *`, so the
# pyes names this module actually relies on are visible.
from pyes import ES, FuzzyLikeThisFieldQuery, TextQuery
import nltk.stem.snowball as snowball
import fuzzy


def grab_transcripts(sjson_directory):
    """Return (cleaned_transcript, youtube_uuid) pairs for every transcript
    file directly inside `sjson_directory`.

    "." is not a valid character for a youtube id, so it can be reliably
    used to pick up the start of the file extension.
    """
    children = os.listdir(sjson_directory)
    transcript_files = [child for child in children
                        if pt.isfile(pt.join(sjson_directory, child))]
    uuids = [name[:name.find(".")] for name in transcript_files]
    raw_transcripts = []
    for name in transcript_files:
        # `with` guarantees the handle is closed; the original
        # `open(...).read()` leaked one descriptor per transcript.
        with open(pt.join(sjson_directory, name)) as handle:
            raw_transcripts.append(handle.read())
    return zip([clean_transcript(raw) for raw in raw_transcripts], uuids)


def clean_transcript(transcript_string):
    """Parse raw sjson and return its 'text' entries as a single
    lower-case, punctuation-free, newline-free string.

    Errors (ValueError/KeyError) propagate for invalid sjson.
    """
    transcript_list = filter(None, json.loads(transcript_string)['text'])
    relevant_text = " ".join([phrase.encode('utf-8').strip()
                              for phrase in transcript_list])
    # str.translate(None, chars) is the Python-2 "delete these chars" form.
    relevant_text = relevant_text.lower().translate(None, string.punctuation)
    return re.sub('\n', " ", relevant_text)


def phonetic_transcript(clean_text, stemmer):
    """Return `clean_text` with each word replaced by its phonetic code.

    (Parameter renamed from `clean_transcript`, which shadowed the
    function of the same name above.)
    """
    return " ".join([phoneticize(word, stemmer) for word in clean_text.split(" ")])


def phoneticize(word, stemmer):
    """Stem `word`, strip non-ascii bytes, and return its NYSIIS code."""
    ascii_word = word.decode('utf-8').encode('ascii', 'ignore')
    return fuzzy.nysiis(stemmer.stem(ascii_word))


def initialize_transcripts(database, mapping):
    """Create the transcript index on `database`.

    NOTE(review): `mapping` is currently unused (per-course mappings are
    installed by index_course); kept for interface compatibility.
    """
    database.indices.create_index("transcript-index")


def index_course(database, sjson_directory, course_name, mapping):
    """Index every transcript in `sjson_directory` under `course_name`,
    then refresh the index so the documents are searchable immediately."""
    stemmer = snowball.EnglishStemmer()
    database.put_mapping(course_name, {'properties': mapping}, "transcript-index")
    for text, uuid in grab_transcripts(sjson_directory):
        data_map = {
            "searchable_text": text,
            "phonetic_text": phonetic_transcript(text, stemmer),
            "uuid": uuid,
        }
        database.index(data_map, "transcript-index", course_name)
    database.indices.refresh("transcript-index")


def fuzzy_search(database, query, course_name):
    """Fuzzy-match `query` against the raw transcript text.

    NOTE(review): `course_name` is currently unused -- results are not
    restricted to one course; kept for interface compatibility.
    """
    search_query = FuzzyLikeThisFieldQuery("searchable_text", query)
    return database.search(query=search_query, indices="transcript-index")


def phonetic_search(database, query, course_name):
    """Match `query` phonetically (typo/homophone tolerant) against the
    transcripts.  `course_name` is currently unused, as in fuzzy_search."""
    stemmer = snowball.EnglishStemmer()
    search_query = TextQuery("phonetic_text", phoneticize(query, stemmer))
    return database.search(query=search_query, indices="transcript-index")


if __name__ == "__main__":
    # Ad-hoc smoke test.  Guarded so importing this module has no side
    # effects; the original ran all of this at import time against a
    # hard-coded local path and ES instance.
    data_directory = '/Users/climatologist/edx_all/data/content-mit-6002x/static/subs/'
    mapping_directory = 'mapping.json'
    database = ES('127.0.0.1:9200')
    with open(mapping_directory, 'rb') as mapping_file:
        mapping = json.loads(mapping_file.read())

    #initialize_transcripts(database, mapping)
    #index_course(database, data_directory, "test-course", mapping)
    fuzzy_results = fuzzy_search(database, "gaussian", "test-course")
    phonetic_results = phonetic_search(database, "gaussian", "test-course")
    for result in fuzzy_results:
        print("Fuzzy: " + result['uuid'])
    for result in phonetic_results:
        print("Phonetic: " + result['uuid'])
from courseware.model_data import ModelDataCache
from courseware.module_render import get_module_for_descriptor

from courseware.views import registered_for_course
#import logging
import lxml
import re
import posixpath
import urllib
from os import listdir
from os.path import isfile
from os.path import join


def test(request):
    """Prototype view: walk the current user's registered courses down to
    their leaf modules and render the transcript directories found in one
    bottom-level module's video assets."""
    user = User.objects.prefetch_related("groups").get(id=request.user.id)
    request.user = user

    course_list = get_courses(user, request.META.get('HTTP_HOST'))

    all_modules = [get_module(request, user, course)
                   for course in course_list
                   if registered_for_course(course, user)]
    child_modules = []
    for module in all_modules:
        child_modules.extend(module.get_children())
    bottom_modules = []
    for module in child_modules:
        bottom_modules.extend(module.get_children())
    # NOTE(review): bottom_modules[2] is a hard-coded index and will
    # IndexError on small courses -- prototype-only, confirm before shipping.
    asset_divs = get_asset_div(convert_to_valid_html(bottom_modules[2].get_html()))
    strings = [get_transcript_directory(lxml.html.tostring(div)) for div in asset_divs]
    search_template = get_template('search.html')
    html = search_template.render(Context({'course_list': strings}))
    return HttpResponse(html)


def get_children(course):
    """Return the locations of a given course's children.

    NOTE(review): reaches into the private `_child_instances` attribute --
    a public accessor would be safer if one exists.
    """
    return [child.location for child in course._child_instances]


def convert_to_valid_html(html):
    """Decode the small set of HTML entities escaped in module HTML."""
    replacements = {"&lt;": "<", "&gt;": ">", "&quot;": "\"", "&#39;": "'"}
    # .items() (not iteritems) behaves identically here and keeps the
    # function runnable on Python 3 as well.
    for entity, char in replacements.items():
        html = html.replace(entity, char)
    return html


def get_asset_div(html_page):
    """Return all elements with css class "video" in `html_page`."""
    return lxml.html.find_class(html_page, "video")


def get_module(request, user, course):
    """Build and return the courseware module for `course`, with its
    descendants' model data pre-cached two levels deep."""
    model_data_cache = ModelDataCache.cache_for_descriptor_descendents(course.id, user, course, depth=2)
    course_module = get_module_for_descriptor(user, request, course, model_data_cache, course.id)
    return course_module


def get_youtube_code(module_html):
    """Extract the 1.0-speed youtube id embedded in a module's html.

    The id sits between "1.0:" and ",1.25" in the video speed map; only
    the first line of the substitution result is meaningful.
    """
    youtube_snippet = re.sub(r'(.*?)(1\.0:)(.*?)(,1\.25)(.*)', r'\3', module_html)
    # partition() keeps the whole string when no newline is present; the
    # original `[:snippet.find('\n')]` silently dropped the final
    # character in that case (find() returns -1).
    return youtube_snippet.partition('\n')[0]


def get_transcript_directory(module_html):
    """Extract the data-caption-asset-path from module html and resolve it
    to the list of transcript files inside that directory."""
    directory_snippet = re.sub(r'(.*?)(data-caption-asset-path=\")(.*?)(\">.*)', r'\3', module_html)
    # Same fix as get_youtube_code: don't chop the last character when
    # there is no trailing newline.
    sliced_directory = directory_snippet.partition('\n')[0]
    return resolve_to_absolute_path(sliced_directory)


def resolve_to_absolute_path(transcript_directory):
    """Normalize a (possibly url-quoted) directory path relative to the
    current root and return the transcript files inside it."""
    normalized_path = posixpath.normpath(urllib.unquote(transcript_directory)).lstrip('/')
    return all_transcript_files(normalized_path)


def all_transcript_files(normalized_path):
    """Return the names of all regular files directly inside
    `normalized_path` (non-recursive)."""
    return [name for name in listdir(normalized_path)
            if isfile(join(normalized_path, name))]