diff --git a/common/djangoapps/search/__init__.py b/common/djangoapps/search/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/common/djangoapps/search/analyzer.json b/common/djangoapps/search/analyzer.json deleted file mode 100644 index 2221d7ad19..0000000000 --- a/common/djangoapps/search/analyzer.json +++ /dev/null @@ -1,36 +0,0 @@ -{ -"analyzer": { - - "transcript_analyzer": { - "type": "custom", - "tokenizer": "standard", - "filter": ["protected", "asciifolding", "custom_word_delimiter", "lowercase", "custom_stemmer", "shingle"], - "char_filter": ["custom_mapping"] - } -}, - -"filter" : { - - "custom_word_delimiter":{ - "type": "word_delimiter", - "preserve_original": "true" - }, - - "custom_stemmer": { - "type": "stemmer", - "name": "english" - }, - - "protected": { - "type": "keyword_marker", - "keywords_path": "protectedWords.txt" - } -}, - -"char_filter": { - "custom_mapping": { - "type": "mapping", - "mappings": ["\n=>-"] - } -} -} \ No newline at end of file diff --git a/common/djangoapps/search/es_requests.py b/common/djangoapps/search/es_requests.py deleted file mode 100644 index 7c4a81faf5..0000000000 --- a/common/djangoapps/search/es_requests.py +++ /dev/null @@ -1,104 +0,0 @@ -import requests -import json - - -class ElasticDatabase: - - def __init__(self, url, index_settings_file, *args): - """ - Will initialize elastic search object with any indices specified by args - - specifically the url should be something of the form `http://localhost:9200` - importantly do not include a slash at the end of the url name. - - args should be a list of dictionaries, each dictionary specifying a JSON mapping - to be used for a specific type. - - Example Dictionary: - {"index": "transcript", "type": "6-002x", "mapping": - { - "properties" : { - "searchable_text": { - "type": "string", - "store": "yes", - "index": "analyzed" - } - } - } - } - - Eventually we will support different configuration files for different indices, but - since this is only indexing transcripts right now it seems excessive""" - - self.url = url - self.args = args - self.index_settings = open(index_settings_file, 'rb').read() - - def parse_args(self): - for mapping in self.args: - try: - json_mapping = json.loads(mapping) - except ValueError: - print "Badly formed JSON args, please check your mappings file" - break - - try: - index = json_mapping['index'] - type_ = json_mapping['type'] - mapping = json_mapping['mapping'] - self.setup_index(index) - self.setup_type(index, type_, mapping) - except KeyError: - print "Could not find needed keys. Keys found: " - print mapping.keys() - continue - - def setup_type(self, index, type_, json_mapping): - """ - json_mapping should be a dictionary starting at the properties level of a mapping. - - The type level will be added, so if you include it things will break. The purpose of this - is to encourage loose coupling between types and mappings for better code - """ - - full_url = "/".join([self.url, index, type_, "_mapping"]) - json_put_body = {type_: json_mapping} - requests.put(full_url, data=json_put_body) - - def has_index(self, index): - """Checks to see if a given index exists in the database returns existance boolean, - - If this returns something other than a 200 or a 404 something is wrong and so we error""" - full_url = "/".join([self.url, index]) - status = requests.head(full_url).status_code - if status == 200: - return True - if status == 404: - return False - else: - print "Got an unexpected reponse code: " + str(status) - raise - - def setup_index(self, index): - """Creates a new elasticsearch index, returns the response it gets""" - full_url = "/".join(self.url, index) + "/" - return requests.put(full_url, data=self.index_settings) - - def index_data(self, index, type_, id_, data): - """Data should be passed in as a dictionary, assumes it matches the given mapping""" - full_url = "/".join([self.url, index, type_, id_]) - response = requests.put(full_url, json.dumps(data)) - return json.loads(response)['ok'] - - def get_index_settings(self, index): - """Returns the current settings of """ - full_url = "/".join([self.url, index, "_settings"]) - return json.loads(requests.get(full_url)._content) - - def get_type_mapping(self, index, type_): - full_url = "/".join([self.url, index, type_, "_mapping"]) - return json.loads(requests.get(full_url)._content) - - def index_data(self, index, type_, id_, json_data): - full_url = "/".join([self.url, index, type_, id_]) - requests.put(full_url, data=json_data) diff --git a/common/djangoapps/search/index.py b/common/djangoapps/search/index.py deleted file mode 100644 index 7012def8cb..0000000000 --- a/common/djangoapps/search/index.py +++ /dev/null @@ -1,82 +0,0 @@ -import os -import os.path as pt -import json -import re -import string - -from pyes import * -import nltk.stem.snowball as snowball -import fuzzy - - -def grab_transcripts(sjson_directory): - """Returns referenes to all of the files contained within a subs directory""" - all_children = [child for child in os.listdir(sjson_directory)] - all_transcripts = [child for child in all_children if pt.isfile(pt.join(sjson_directory, child))] - # . is not a valid character for a youtube id, so it can be reliably used to pick up the start - # of the file extension - uuids = [transcript_id[:transcript_id.find(".")] for transcript_id in all_transcripts] - parsed_transcripts = [open(pt.join(sjson_directory, transcript)).read() for transcript in all_transcripts] - return zip([clean_transcript(transcript) for transcript in parsed_transcripts], uuids) - - -def clean_transcript(transcript_string): - """Tries to parse and clean a raw transcript. Errors for invalid sjson""" - transcript_list = filter(None, json.loads(transcript_string)['text']) - relevant_text = " ".join([phrase.encode('utf-8').strip() for phrase in transcript_list]) - relevant_text = relevant_text.lower().translate(None, string.punctuation) - cleanedText = re.sub('\n', " ", relevant_text) - return cleanedText - - -def phonetic_transcript(clean_transcript, stemmer): - return " ".join([phoneticize(word, stemmer) for word in clean_transcript.split(" ")]) - - -def phoneticize(word, stemmer): - encode = lambda word: word.decode('utf-8').encode('ascii', 'ignore') - phonetic = lambda word: fuzzy.nysiis(stemmer.stem(encode(word))) - return phonetic(word) - - -def initialize_transcripts(database, mapping): - database.indices.create_index("transcript-index") - - -def index_course(database, sjson_directory, course_name, mapping): - stemmer = snowball.EnglishStemmer() - database.put_mapping(course_name, {'properties': mapping}, "transcript-index") - all_transcripts = grab_transcripts(sjson_directory) - video_counter = 0 - for transcript_tuple in all_transcripts: - data_map = {"searchable_text": transcript_tuple[0], "uuid": transcript_tuple[1]} - data_map['phonetic_text'] = phonetic_transcript(transcript_tuple[0], stemmer) - database.index(data_map, "transcript-index", course_name) - video_counter += 1 - database.indices.refresh("transcript-index") - - -def fuzzy_search(database, query, course_name): - search_query = FuzzyLikeThisFieldQuery("searchable_text", query) - return database.search(query=search_query, indices="transcript-index") - - -def phonetic_search(database, query, course_name): - stemmer = snowball.EnglishStemmer() - search_query = TextQuery("phonetic_text", phoneticize(query, stemmer)) - return database.search(query=search_query, indices="transcript-index") - - -data_directory = '/Users/climatologist/edx_all/data/content-mit-6002x/static/subs/' -mapping_directory = 'mapping.json' -database = ES('127.0.0.1:9200') -mapping = json.loads(open(mapping_directory, 'rb').read()) - -#initialize_transcripts(database, mapping) -#index_course(database, data_directory, "test-course", mapping) -fuzzy_results = fuzzy_search(database, "gaussian", "test-course") -phonetic_results = phonetic_search(database, "gaussian", "test-course") -for r in fuzzy_results: - print "Fuzzy: " + r['uuid'] -for r in phonetic_results: - print "Phonetic: " + r['uuid'] diff --git a/common/djangoapps/search/mapping.json b/common/djangoapps/search/mapping.json deleted file mode 100644 index 8c4ac045ac..0000000000 --- a/common/djangoapps/search/mapping.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - - "searchable_text": { - "boost": 1.0, - "index": "analyzed", - "store": "yes", - "type": "string", - "term_vector": "with_positions_offsets", - "analyzer": "transcript_analyzer" - } -} \ No newline at end of file diff --git a/common/djangoapps/search/protectedWords.txt b/common/djangoapps/search/protectedWords.txt deleted file mode 100644 index 5f3fce5203..0000000000 --- a/common/djangoapps/search/protectedWords.txt +++ /dev/null @@ -1,15 +0,0 @@ -"gauss", -"stokes", -"navier", -"einstein", -"goddard", -"oppenheimer", -"bloch", -"hawkings", -"newton", -"bohr", -"darwin", -"planck", -"rontgen", -"tesla", -"franklin" \ No newline at end of file diff --git a/common/djangoapps/search/settings.json b/common/djangoapps/search/settings.json deleted file mode 100644 index 929ec092c9..0000000000 --- a/common/djangoapps/search/settings.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "settings": { - "index": { - "number_of_replicas": 2, - "number_of_shards": 3 - } - } -} \ No newline at end of file diff --git a/common/djangoapps/search/views.py b/common/djangoapps/search/views.py deleted file mode 100644 index f15dc9266b..0000000000 --- a/common/djangoapps/search/views.py +++ /dev/null @@ -1,83 +0,0 @@ -from django.http import HttpResponse -from django.template.loader import get_template -from django.template import Context -from django.contrib.auth.models import User -from django.contrib.staticfiles import finders -from courseware.courses import get_courses -from courseware.model_data import ModelDataCache -from courseware.module_render import get_module_for_descriptor - -from courseware.views import registered_for_course -#import logging -import lxml -import re -import posixpath -import urllib -from os import listdir -from os.path import isfile -from os.path import join - - -def test(request): - user = User.objects.prefetch_related("groups").get(id=request.user.id) - request.user = user - - course_list = get_courses(user, request.META.get('HTTP_HOST')) - - all_modules = [get_module(request, user, course) for course in course_list if registered_for_course(course, user)] - child_modules = [] - for module in all_modules: - child_modules.extend(module.get_children()) - bottom_modules = [] - for module in child_modules: - bottom_modules.extend(module.get_children()) - asset_divs = get_asset_div(convert_to_valid_html(bottom_modules[2].get_html())) - strings = [get_transcript_directory(lxml.html.tostring(div)) for div in asset_divs] - search_template = get_template('search.html') - html = search_template.render(Context({'course_list': strings})) - return HttpResponse(html) - - -def get_children(course): - """Returns the children of a given course""" - attributes = [child.location for child in course._child_instances] - return attributes - - -def convert_to_valid_html(html): - replacement = {"<": "<", ">": ">", """: "\"", "'": "'"} - for i, j in replacement.iteritems(): - html = html.replace(i, j) - return html - - -def get_asset_div(html_page): - return lxml.html.find_class(html_page, "video") - - -def get_module(request, user, course): - model_data_cache = ModelDataCache.cache_for_descriptor_descendents(course.id, user, course, depth=2) - course_module = get_module_for_descriptor(user, request, course, model_data_cache, course.id) - return course_module - - -def get_youtube_code(module_html): - youtube_snippet = re.sub(r'(.*?)(1\.0:)(.*?)(,1\.25)(.*)', r'\3', module_html) - sliced_youtube_code = youtube_snippet[:youtube_snippet.find('\n')] - return sliced_youtube_code - - -def get_transcript_directory(module_html): - directory_snippet = re.sub(r'(.*?)(data-caption-asset-path=\")(.*?)(\">.*)', r'\3', module_html) - sliced_directory = directory_snippet[:directory_snippet.find('\n')] - return resolve_to_absolute_path(sliced_directory) - - -def resolve_to_absolute_path(transcript_directory): - normalized_path = posixpath.normpath(urllib.unquote(transcript_directory)).lstrip('/') - return all_transcript_files(normalized_path) - - -def all_transcript_files(normalized_path): - files = [transcript for transcript in listdir(normalized_path) if isfile(join(normalized_path, transcript))] - return files