diff options
author | Tomeu Vizoso <tomeu@tomeuvizoso.net> | 2008-08-22 13:33:04 (GMT) |
---|---|---|
committer | Tomeu Vizoso <tomeu@tomeuvizoso.net> | 2008-08-22 13:33:04 (GMT) |
commit | b740fc38b11e6b7a0699ef97987889ffee47d3f2 (patch) | |
tree | 47845d4388db2931778e6ae9c1cdb894fa156bc2 /src | |
parent | 43d44783dbe2e41fb764b45dc24019dcc49c8c4d (diff) |
Index text and implement full text search
Diffstat (limited to 'src')
-rw-r--r-- | src/olpc/datastore/indexstore.py | 51 |
1 files changed, 50 insertions, 1 deletions
```diff
diff --git a/src/olpc/datastore/indexstore.py b/src/olpc/datastore/indexstore.py
index aece2d1..0adf9d4 100644
--- a/src/olpc/datastore/indexstore.py
+++ b/src/olpc/datastore/indexstore.py
@@ -1,7 +1,8 @@
 import os
+import logging
 
 import xapian
-from xapian import WritableDatabase, Document, Enquire, Query
+from xapian import WritableDatabase, Document, Enquire, Query, QueryParser
 
 _MAX_LIMIT = 4096
 
@@ -12,6 +13,8 @@ _VALUE_MIME_TYPE = 3
 _VALUE_ACTIVITY = 4
 _VALUE_KEEP = 5
 
+_PROPERTIES_NOT_TO_INDEX = ['timestamp', 'activity_id', 'keep', 'preview']
+
 class IndexStore(object):
     def __init__(self, root_path):
         index_path = os.path.join(root_path, 'index')
@@ -34,12 +37,38 @@ class IndexStore(object):
         document.add_value(_VALUE_MIME_TYPE, str(properties['keep']))
         document.add_value(_VALUE_ACTIVITY, properties['activity'])
 
+        term_generator = xapian.TermGenerator()
+
+        # TODO: we should do stemming, but in which language?
+        #if language is not None:
+        #    term_generator.set_stemmer(_xapian.Stem(language))
+
+        # TODO: we should use a stopper
+        #if stop is not None:
+        #    stopper = _xapian.SimpleStopper()
+        #    for term in stop:
+        #        stopper.add (term)
+        #    term_generator.set_stopper (stopper)
+
+        term_generator.set_document(document)
+        term_generator.index_text_without_positions(
+                self._extract_text(properties), 1, '')
+
         if not self._document_exists(uid):
             self._database.add_document(document)
         else:
             self._database.replace_document('Q' + uid, document)
         self._database.flush()
 
+    def _extract_text(self, properties):
+        text = ''
+        for key, value in properties.items():
+            if key not in _PROPERTIES_NOT_TO_INDEX:
+                if text:
+                    text += ' '
+                text += value
+        return text
+
     def find(self, query):
         enquire = Enquire(self._database)
         enquire.set_query(self._parse_query(query))
@@ -62,8 +91,28 @@ class IndexStore(object):
         return (uids, total_count)
 
     def _parse_query(self, query_dict):
+        logging.debug('_parse_query %r' % query_dict)
         queries = []
 
+        if query_dict.has_key('query'):
+            query_parser = QueryParser()
+            query_parser.set_database(self._database)
+            #query_parser.set_default_op(Query.OP_AND)
+
+            # TODO: we should do stemming, but in which language?
+            #query_parser.set_stemmer(_xapian.Stem(lang))
+            #query_parser.set_stemming_strategy(qp.STEM_SOME)
+
+            query = query_parser.parse_query(
+                    query_dict['query'],
+                    QueryParser.FLAG_PHRASE |
+                            QueryParser.FLAG_BOOLEAN |
+                            QueryParser.FLAG_LOVEHATE |
+                            QueryParser.FLAG_WILDCARD,
+                    '')
+
+            queries.append(query)
+
         if query_dict.has_key('uid'):
             queries.append(Query('Q' + query_dict['uid']))
 
```