import json
import logging

log = logging.getLogger(__name__)


class ElasticDatabase(object):
    """Minimal client that talks to the elasticsearch REST API directly.

    The instance stores a base url, the raw body used when creating indices
    and a sequence of type-mapping specifications.  Nothing is sent to the
    server until one of the methods below is called; in particular the
    constructor does not create indices, call `parse_args` for that.
    """

    def __init__(self, url, index_settings_file, *args):
        """Store the connection details and load the index settings.

        url is the root of the elasticsearch server, something of the form
        `http://localhost:9200`.  A trailing slash is tolerated and removed.

        index_settings_file is the path of a file whose contents are sent
        verbatim as the request body whenever an index is created.

        args holds the type mappings consumed by `parse_args`.  Each entry is
        either a dictionary or a JSON string encoding one, for example:

            {"index": "transcript", "type": "6-002x", "mapping":
                {
                    "properties": {
                        "searchable_text": {
                            "type": "string",
                            "store": "yes",
                            "index": "analyzed"
                        }
                    }
                }
            }

        Every index is created with the same settings file; per-index
        configuration is not supported.
        """
        self.url = url.rstrip("/")
        self.args = args
        # Context manager so the file handle is released immediately.
        with open(index_settings_file, 'rb') as settings_file:
            self.index_settings = settings_file.read()

    def _build_url(self, *parts):
        """Return the base url joined with parts by single slashes."""
        # "%s" formatting lets numeric ids be used as path segments.
        return "/".join([self.url] + ["%s" % part for part in parts])

    @staticmethod
    def _request(method, url, **kwargs):
        """Send one HTTP request through `requests` and return its response."""
        # Imported here rather than at module level so that importing this
        # module (and using its pure helpers) does not require `requests`.
        import requests
        return requests.request(method, url, **kwargs)

    @staticmethod
    def _decode_mapping(mapping):
        """Return mapping as a dictionary, or None when it cannot be one.

        Dictionaries are returned unchanged; anything else is parsed as JSON
        and must decode to a dictionary.
        """
        if isinstance(mapping, dict):
            return mapping
        try:
            decoded = json.loads(mapping)
        except (TypeError, ValueError):
            return None
        return decoded if isinstance(decoded, dict) else None

    def parse_args(self):
        """Create the index and type mapping described by each entry of args.

        Processing stops at the first entry that is not a valid JSON object.
        An entry that lacks one of the "index", "type" or "mapping" keys is
        reported and skipped, and processing continues with the next one.
        """
        for raw_mapping in self.args:
            spec = self._decode_mapping(raw_mapping)
            if spec is None:
                log.error("Badly formed JSON args, please check your mappings file")
                break

            # Only the key lookups are guarded, so a KeyError raised while
            # talking to the server is not mistaken for a malformed entry.
            try:
                index = spec['index']
                type_ = spec['type']
                mapping = spec['mapping']
            except KeyError:
                log.error("Could not find needed keys. Keys found: %s", list(spec))
                continue

            self.setup_index(index)
            self.setup_type(index, type_, mapping)

    def setup_type(self, index, type_, json_mapping):
        """Install a mapping for type_ inside index and return the response.

        json_mapping should be a dictionary starting at the properties level
        of a mapping.  The type level is added here, so do not include it.
        Keeping the type out of the mapping lets one mapping be reused for
        several types.
        """
        full_url = self._build_url(index, type_, "_mapping")
        # Serialise explicitly: handing `requests` a dict would form-encode it.
        json_put_body = json.dumps({type_: json_mapping})
        return self._request("put", full_url, data=json_put_body)

    def has_index(self, index):
        """Return True if index exists on the server and False if it does not.

        A 200 answer means the index exists and a 404 means it does not.
        Any other status code raises RuntimeError.
        """
        response = self._request("head", self._build_url(index), allow_redirects=False)
        status = response.status_code
        if status == 200:
            return True
        if status == 404:
            return False
        raise RuntimeError("Got an unexpected response code: %s" % status)

    def setup_index(self, index):
        """Create a new elasticsearch index and return the response it gets."""
        full_url = self._build_url(index) + "/"
        return self._request("put", full_url, data=self.index_settings)

    def index_data(self, index, type_, id_, json_data):
        """Store one document under index/type_/id_ and return the response.

        json_data may be an already serialised JSON string, or a dictionary
        or list which is serialised here.  It is assumed to match the mapping
        installed for type_.
        """
        if isinstance(json_data, (dict, list)):
            json_data = json.dumps(json_data)
        return self._request("put", self._build_url(index, type_, id_), data=json_data)

    def get_index_settings(self, index):
        """Return the current settings of index, decoded from JSON."""
        full_url = self._build_url(index, "_settings")
        return json.loads(self._request("get", full_url).text)

    def get_type_mapping(self, index, type_):
        """Return the mapping installed for type_ in index, decoded from JSON."""
        full_url = self._build_url(index, type_, "_mapping")
        return json.loads(self._request("get", full_url).text)