Merge pull request #88 from edx/zoldak/revert-search-push
Revert accidental push of changes to master
@@ -1,36 +0,0 @@
{
    "analyzer": {
        "transcript_analyzer": {
            "type": "custom",
            "tokenizer": "standard",
            "filter": ["protected", "asciifolding", "custom_word_delimiter", "lowercase", "custom_stemmer", "shingle"],
            "char_filter": ["custom_mapping"]
        }
    },
    "filter": {
        "custom_word_delimiter": {
            "type": "word_delimiter",
            "preserve_original": "true"
        },
        "custom_stemmer": {
            "type": "stemmer",
            "name": "english"
        },
        "protected": {
            "type": "keyword_marker",
            "keywords_path": "protectedWords.txt"
        }
    },
    "char_filter": {
        "custom_mapping": {
            "type": "mapping",
            "mappings": ["\n=>-"]
        }
    }
}
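These analysis settings are sent verbatim to Elasticsearch when an index is created (see ElasticDatabase.setup_index below). A minimal sketch of that request, assuming the settings above are saved to a local file and a node is running at http://localhost:9200; the filename and index name here are hypothetical:

import requests

# Create the index, passing the analyzer definition as the request body.
settings = open('analyzer_settings.json', 'rb').read()  # hypothetical filename
response = requests.put('http://localhost:9200/transcript-index/', data=settings)
print response.status_code  # expect 200 when the index is created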
@@ -1,104 +0,0 @@
import requests
import json


class ElasticDatabase:

    def __init__(self, url, index_settings_file, *args):
        """
        Initializes the elasticsearch object with any indices specified by args.

        The url should be of the form `http://localhost:9200`; do not include
        a trailing slash.

        args should be a list of JSON strings, each specifying a mapping to be
        used for a specific type.

        Example dictionary:
        {"index": "transcript", "type": "6-002x", "mapping":
            {
                "properties": {
                    "searchable_text": {
                        "type": "string",
                        "store": "yes",
                        "index": "analyzed"
                    }
                }
            }
        }

        Eventually we will support different configuration files for different
        indices, but since this is only indexing transcripts right now that
        seems excessive."""

        self.url = url
        self.args = args
        self.index_settings = open(index_settings_file, 'rb').read()

    def parse_args(self):
        for mapping in self.args:
            try:
                json_mapping = json.loads(mapping)
            except ValueError:
                print "Badly formed JSON args, please check your mappings file"
                break

            try:
                index = json_mapping['index']
                type_ = json_mapping['type']
                mapping = json_mapping['mapping']
                self.setup_index(index)
                self.setup_type(index, type_, mapping)
            except KeyError:
                print "Could not find needed keys. Keys found: "
                print json_mapping.keys()
                continue

    def setup_type(self, index, type_, json_mapping):
        """
        json_mapping should be a dictionary starting at the properties level of a mapping.

        The type level will be added, so if you include it things will break. The purpose of this
        is to encourage loose coupling between types and mappings for better code.
        """
        full_url = "/".join([self.url, index, type_, "_mapping"])
        json_put_body = {type_: json_mapping}
        # Serialize the body explicitly; passing a dict would be form-encoded.
        requests.put(full_url, data=json.dumps(json_put_body))

    def has_index(self, index):
        """Checks whether a given index exists in the database and returns a boolean.

        Anything other than a 200 or a 404 means something is wrong, so we raise."""
        full_url = "/".join([self.url, index])
        status = requests.head(full_url).status_code
        if status == 200:
            return True
        elif status == 404:
            return False
        else:
            raise RuntimeError("Got an unexpected response code: " + str(status))

    def setup_index(self, index):
        """Creates a new elasticsearch index, returns the response it gets"""
        full_url = "/".join([self.url, index]) + "/"
        return requests.put(full_url, data=self.index_settings)

    def index_data(self, index, type_, id_, data):
        """Data should be passed in as a dictionary; assumes it matches the given mapping"""
        full_url = "/".join([self.url, index, type_, id_])
        response = requests.put(full_url, json.dumps(data))
        return json.loads(response.content)['ok']

    def get_index_settings(self, index):
        """Returns the current settings of a given index"""
        full_url = "/".join([self.url, index, "_settings"])
        return json.loads(requests.get(full_url).content)

    def get_type_mapping(self, index, type_):
        """Returns the current mapping of a given type"""
        full_url = "/".join([self.url, index, type_, "_mapping"])
        return json.loads(requests.get(full_url).content)

    def index_json_data(self, index, type_, id_, json_data):
        """Like index_data, but takes pre-serialized JSON"""
        full_url = "/".join([self.url, index, type_, id_])
        requests.put(full_url, data=json_data)
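A usage sketch for the class above; the module name, index, and type are hypothetical, and the mapping mirrors the example dictionary in the class docstring:

from elastic_database import ElasticDatabase  # hypothetical module name

db = ElasticDatabase('http://localhost:9200', 'settings.json')
db.setup_index('transcript')
db.setup_type('transcript', '6-002x', {
    "properties": {
        "searchable_text": {"type": "string", "store": "yes", "index": "analyzed"}
    }
})
# index_data serializes the dictionary and returns Elasticsearch's 'ok' flag.
print db.index_data('transcript', '6-002x', '1', {"searchable_text": "gauss law lecture"})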
@@ -1,82 +0,0 @@
import os
import os.path as pt
import json
import re
import string

from pyes import ES, FuzzyLikeThisFieldQuery, TextQuery
import nltk.stem.snowball as snowball
import fuzzy


def grab_transcripts(sjson_directory):
    """Returns references to all of the files contained within a subs directory"""
    all_children = os.listdir(sjson_directory)
    all_transcripts = [child for child in all_children if pt.isfile(pt.join(sjson_directory, child))]
    # . is not a valid character for a youtube id, so it can be reliably used
    # to pick up the start of the file extension
    uuids = [transcript_id[:transcript_id.find(".")] for transcript_id in all_transcripts]
    parsed_transcripts = [open(pt.join(sjson_directory, transcript)).read() for transcript in all_transcripts]
    return zip([clean_transcript(transcript) for transcript in parsed_transcripts], uuids)


def clean_transcript(transcript_string):
    """Tries to parse and clean a raw transcript. Errors for invalid sjson"""
    transcript_list = filter(None, json.loads(transcript_string)['text'])
    relevant_text = " ".join([phrase.encode('utf-8').strip() for phrase in transcript_list])
    relevant_text = relevant_text.lower().translate(None, string.punctuation)
    cleaned_text = re.sub('\n', " ", relevant_text)
    return cleaned_text


def phonetic_transcript(cleaned_transcript, stemmer):
    return " ".join([phoneticize(word, stemmer) for word in cleaned_transcript.split(" ")])


def phoneticize(word, stemmer):
    # Stem the word, strip any non-ascii characters, then take its NYSIIS phonetic code.
    encode = lambda word: word.decode('utf-8').encode('ascii', 'ignore')
    phonetic = lambda word: fuzzy.nysiis(stemmer.stem(encode(word)))
    return phonetic(word)


def initialize_transcripts(database, mapping):
    database.indices.create_index("transcript-index")


def index_course(database, sjson_directory, course_name, mapping):
    stemmer = snowball.EnglishStemmer()
    database.put_mapping(course_name, {'properties': mapping}, "transcript-index")
    all_transcripts = grab_transcripts(sjson_directory)
    video_counter = 0
    for transcript_tuple in all_transcripts:
        data_map = {"searchable_text": transcript_tuple[0], "uuid": transcript_tuple[1]}
        data_map['phonetic_text'] = phonetic_transcript(transcript_tuple[0], stemmer)
        database.index(data_map, "transcript-index", course_name)
        video_counter += 1
    database.indices.refresh("transcript-index")


def fuzzy_search(database, query, course_name):
    search_query = FuzzyLikeThisFieldQuery("searchable_text", query)
    return database.search(query=search_query, indices="transcript-index")


def phonetic_search(database, query, course_name):
    stemmer = snowball.EnglishStemmer()
    search_query = TextQuery("phonetic_text", phoneticize(query, stemmer))
    return database.search(query=search_query, indices="transcript-index")


data_directory = '/Users/climatologist/edx_all/data/content-mit-6002x/static/subs/'
mapping_directory = 'mapping.json'
database = ES('127.0.0.1:9200')
mapping = json.loads(open(mapping_directory, 'rb').read())

#initialize_transcripts(database, mapping)
#index_course(database, data_directory, "test-course", mapping)
fuzzy_results = fuzzy_search(database, "gaussian", "test-course")
phonetic_results = phonetic_search(database, "gaussian", "test-course")
for r in fuzzy_results:
    print "Fuzzy: " + r['uuid']
for r in phonetic_results:
    print "Phonetic: " + r['uuid']
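The phonetic search above only works because a query is pushed through exactly the same stem-then-phoneticize pipeline that produced phonetic_text at index time. A small sketch of that invariant; the exact NYSIIS codes depend on the installed fuzzy and nltk versions, so none are asserted here:

import nltk.stem.snowball as snowball
import fuzzy

stemmer = snowball.EnglishStemmer()
# A canonical spelling and a misspelling; if their codes agree, the
# misspelled query still matches the correctly spelled transcript.
for word in ["gaussian", "gausian"]:
    print word, "->", fuzzy.nysiis(stemmer.stem(word).encode('ascii', 'ignore'))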
@@ -1,11 +0,0 @@
{
    "searchable_text": {
        "boost": 1.0,
        "index": "analyzed",
        "store": "yes",
        "type": "string",
        "term_vector": "with_positions_offsets",
        "analyzer": "transcript_analyzer"
    }
}
@@ -1,15 +0,0 @@
"gauss",
"stokes",
"navier",
"einstein",
"goddard",
"oppenheimer",
"bloch",
"hawkings",
"newton",
"bohr",
"darwin",
"planck",
"rontgen",
"tesla",
"franklin"
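These are the names fed to the "protected" keyword_marker filter above, which shields them from the stemmer. Elasticsearch word-list files ordinarily hold one bare word per line, so the quotes and commas here look like leftovers from a JSON fragment. A hedged way to see the filter in action, assuming the index was created as transcript-index on a local node (the index name is hypothetical; the _analyze endpoint is standard):

import requests

# Run the custom analyzer over sample text and print the tokens that
# would actually be indexed; "gauss" should survive unstemmed.
url = 'http://localhost:9200/transcript-index/_analyze?analyzer=transcript_analyzer'
print requests.post(url, data='gauss law for magnetism').content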
@@ -1,8 +0,0 @@
{
    "settings": {
        "index": {
            "number_of_replicas": 2,
            "number_of_shards": 3
        }
    }
}
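These shard and replica counts travel in the same settings body that setup_index PUTs at index creation. A quick check that they took effect, again with a hypothetical index name:

import requests
import json

# Fetch the live settings and look for the replica/shard counts above.
settings = json.loads(requests.get('http://localhost:9200/transcript-index/_settings').content)
print settings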
@@ -1,83 +0,0 @@
from django.http import HttpResponse
from django.template.loader import get_template
from django.template import Context
from django.contrib.auth.models import User
from django.contrib.staticfiles import finders
from courseware.courses import get_courses
from courseware.model_data import ModelDataCache
from courseware.module_render import get_module_for_descriptor

from courseware.views import registered_for_course
#import logging
import lxml.html
import re
import posixpath
import urllib
from os import listdir
from os.path import isfile
from os.path import join


def test(request):
    user = User.objects.prefetch_related("groups").get(id=request.user.id)
    request.user = user

    course_list = get_courses(user, request.META.get('HTTP_HOST'))

    all_modules = [get_module(request, user, course) for course in course_list if registered_for_course(course, user)]
    child_modules = []
    for module in all_modules:
        child_modules.extend(module.get_children())
    bottom_modules = []
    for module in child_modules:
        bottom_modules.extend(module.get_children())
    # Prototype: inspects a single hard-coded module.
    asset_divs = get_asset_div(convert_to_valid_html(bottom_modules[2].get_html()))
    strings = [get_transcript_directory(lxml.html.tostring(div)) for div in asset_divs]
    search_template = get_template('search.html')
    html = search_template.render(Context({'course_list': strings}))
    return HttpResponse(html)


def get_children(course):
    """Returns the children of a given course"""
    attributes = [child.location for child in course._child_instances]
    return attributes


def convert_to_valid_html(html):
    """Unescapes the HTML entities that module rendering leaves in the markup"""
    replacement = {"&lt;": "<", "&gt;": ">", "&quot;": "\"", "&#39;": "'"}
    for i, j in replacement.iteritems():
        html = html.replace(i, j)
    return html


def get_asset_div(html_page):
    return lxml.html.find_class(html_page, "video")


def get_module(request, user, course):
    model_data_cache = ModelDataCache.cache_for_descriptor_descendents(course.id, user, course, depth=2)
    course_module = get_module_for_descriptor(user, request, course, model_data_cache, course.id)
    return course_module


def get_youtube_code(module_html):
    # Pull the normal-speed youtube id out of the module's speed:id list.
    youtube_snippet = re.sub(r'(.*?)(1\.0:)(.*?)(,1\.25)(.*)', r'\3', module_html)
    sliced_youtube_code = youtube_snippet[:youtube_snippet.find('\n')]
    return sliced_youtube_code


def get_transcript_directory(module_html):
    directory_snippet = re.sub(r'(.*?)(data-caption-asset-path=\")(.*?)(\">.*)', r'\3', module_html)
    sliced_directory = directory_snippet[:directory_snippet.find('\n')]
    return resolve_to_absolute_path(sliced_directory)


def resolve_to_absolute_path(transcript_directory):
    normalized_path = posixpath.normpath(urllib.unquote(transcript_directory)).lstrip('/')
    return all_transcript_files(normalized_path)


def all_transcript_files(normalized_path):
    files = [transcript for transcript in listdir(normalized_path) if isfile(join(normalized_path, transcript))]
    return files
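To make the regex in get_transcript_directory concrete, here is a sketch against a hypothetical fragment of module HTML; the attribute name comes from the real markup that module.get_html() renders, but this value is invented for illustration:

import re

snippet = '<div class="video" data-caption-asset-path="/c4x/MITx/6.002x/asset/subs_">\n</div>'
directory_snippet = re.sub(r'(.*?)(data-caption-asset-path=\")(.*?)(\">.*)', r'\3', snippet)
# Everything after the first newline is unmatched residue, so slice it off.
print directory_snippet[:directory_snippet.find('\n')]  # /c4x/MITx/6.002x/asset/subs_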