diff options
author | Sascha Silbe <sascha@silbe.org> | 2009-08-12 19:55:26 (GMT) |
---|---|---|
committer | Sascha Silbe <sascha@silbe.org> | 2009-08-12 19:55:26 (GMT) |
commit | 137dc9075a7ffdc991c1eecc245d41536234e32b (patch) | |
tree | e3c25da5cc75eb6cc0dcf179beecf3ac6e343ca0 | |
parent | e176669bbb7d91d2975f75816a56b61e8c8e16b9 (diff) |
revert refactoring (=> only test suite) [ref: onlytest]
-rw-r--r-- | src/carquinyol/indexstore.py | 277 |
1 files changed, 107 insertions, 170 deletions
diff --git a/src/carquinyol/indexstore.py b/src/carquinyol/indexstore.py index 45f09a9..42c3132 100644 --- a/src/carquinyol/indexstore.py +++ b/src/carquinyol/indexstore.py @@ -16,12 +16,10 @@ import logging import os -import sys -import time import gobject import xapian -from xapian import WritableDatabase, Document, Enquire, Query +from xapian import WritableDatabase, Document, Enquire, Query, QueryParser from carquinyol import layoutmanager from carquinyol.layoutmanager import MAX_QUERY_LIMIT @@ -42,165 +40,10 @@ _FLUSH_THRESHOLD = 20 # Force a flush after _n_ seconds since the last change to the db _FLUSH_TIMEOUT = 60 -_PROPERTIES_NOT_TO_INDEX = ['timestamp', 'preview'] +_PROPERTIES_NOT_TO_INDEX = ['timestamp', 'activity_id', 'keep', 'preview'] _MAX_RESULTS = int(2 ** 31 - 1) -_QUERY_TERM_MAP = { - 'uid': _PREFIX_UID, - 'activity': _PREFIX_ACTIVITY, - 'activity_id': _PREFIX_ACTIVITY_ID, - 'mime_type': _PREFIX_MIME_TYPE, - 'keep': _PREFIX_KEEP, -} - -_QUERY_VALUE_MAP = { - 'timestamp': _VALUE_TIMESTAMP, -} - - -class TermGenerator (xapian.TermGenerator): - - def index_document(self, document, properties): - document.add_value(_VALUE_TIMESTAMP, str(properties['timestamp'])) - document.add_value(_VALUE_TITLE, properties.get('title', '').strip()) - - xapian.TermGenerator.set_document(self, document) - - properties = dict(properties) - self._index_known(document, properties) - self._index_unknown(document, properties) - - def _index_known(self, document, properties): - for name, prefix in _QUERY_TERM_MAP.items(): - if (name not in properties): - continue - - self._index_property(document, name, properties.pop(name), prefix) - - def _index_unknown(self, document, properties): - for name, value in properties.items(): - self._index_property(document, name, value) - - def _index_property(self, doc, name, value, prefix=''): - if name in _PROPERTIES_NOT_TO_INDEX or not value: - return - - if isinstance(value, unicode): - value = value.encode('utf-8') - elif not 
isinstance(value, basestring): - value = str(value) - - # We need to add the full value (i.e. not split into words) so - # dictionary-based queries work (they don't split the value - # into words). - # TODO: change query parser to generate phrase query instead - doc.add_term(prefix + value) - - # We need to index both with and without prefix because Xapian - # only matches against non-prefix terms if no prefix is given - # inside the query. - if prefix: - self.index_text(value, 1, prefix) - - self.index_text(value) - self.increase_termpos() - - -class QueryParser (xapian.QueryParser): - """QueryParser that understands dictionaries and Xapian query strings. - - The dictionary contains metadata names as keys and either basic types - (exact match), 2-tuples (range, only valid for value-stored metadata) - or a list (multiple exact matches joined with OR) as values. - An empty dictionary matches everything. Queries from different keys - (i.e. different metadata names) are joined with AND. - """ - - def __init__(self): - xapian.QueryParser.__init__(self) - - for name, prefix in _QUERY_TERM_MAP.items(): - self.add_prefix(name, prefix) - - def _parse_query_term(self, name, prefix, value): - if isinstance(m_value, list): - subqueries = [self._parse_query_term(name, prefix, word) - for word in value] - return Query(Query.OP_OR, subqueries) - - else: - return Query(prefix+str(value)) - - def _parse_query_value_range(self, name, value, value_no): - if len(value) != 2: - raise TypeError( - 'Only tuples of size 2 have a defined meaning. 
' - 'Did you mean to pass a list instead?') - - start, end = value - return Query(Query.OP_VALUE_RANGE, value_no, str(start), str(end)) - - def _parse_query_value(self, name, value_no, value): - if isinstance(value, list): - subqueries = [self._parse_query_value(name, value_no, word) - for word in value] - return Query(Query.OP_OR, subqueries) - - elif isinstance(value, tuple): - return self._parse_query_value_range(name, value, value_no) - - elif isinstance(value, dict): - # compatibility option for timestamp: {'start': 0, 'end': 1} - start = value.get('start', 0) - end = value.get('end', sys.maxint) - return self._parse_query_value_range(name, (start, end), value_no) - - else: - return Query(Query.OP_VALUE_RANGE, - _QUERY_VALUE_MAP[name], str(value), str(value)) - - def _parse_query_xapian(self, query_str): - try: - return xapian.QueryParser.parse_query( - self, query_str, - QueryParser.FLAG_PHRASE | - QueryParser.FLAG_BOOLEAN | - QueryParser.FLAG_LOVEHATE | - QueryParser.FLAG_WILDCARD, - '') - - except xapian.QueryParserError, exception: - logging.warning('Invalid query string: '+exception.get_msg()) - return Query() - - def parse_query(self, query_dict, query_string): - logging.debug('parse_query %r %r', query_dict, query_string) - queries = [] - query_dict = dict(query_dict) - - if query_string: - queries.append(self._parse_query_xapian(str(query_string))) - - queries += [ - self._parse_query_term(name, prefix, query_dict.pop(name)) - for name, prefix in _QUERY_TERM_MAP.items() - if name in query_dict] - - queries += [ - self._parse_query_value(name, value_no, query_dict.pop(name)) - for name, value_no in _QUERY_VALUE_MAP.items() - if name in query_dict] - - if not queries: - queries.append(Query('')) - - if query_dict: - logging.warning('Unknown term(s): %r', query_dict) - - logging.debug('queries: %r', [str(q) for q in queries]) - return Query(Query.OP_AND, queries) - class IndexStore(object): """Index metadata and provide rich query facilities on it. 
@@ -210,17 +53,11 @@ class IndexStore(object): self._database = None self._flush_timeout = None self._pending_writes = 0 - self._query_parser = None - self._term_generator = None def open_index(self): index_path = layoutmanager.get_instance().get_index_path() self._database = WritableDatabase(index_path, xapian.DB_CREATE_OR_OPEN) - self._query_parser = QueryParser() - self._query_parser.set_database(self._database) - self._term_generator = TermGenerator() - def close_index(self): self._database.flush() self._database = None @@ -242,24 +79,60 @@ class IndexStore(object): def store(self, uid, properties): document = Document() + document.add_term(_PREFIX_UID + uid) + document.add_term(_PREFIX_ACTIVITY + properties.get('activity', '')) + document.add_term(_PREFIX_MIME_TYPE + properties.get('mime_type', '')) + document.add_term(_PREFIX_ACTIVITY_ID + + properties.get('activity_id', '')) + document.add_term(_PREFIX_KEEP + str(properties.get('keep', 0))) + document.add_value(_VALUE_UID, uid) - self._term_generator.index_document(document, properties) + document.add_value(_VALUE_TIMESTAMP, str(properties['timestamp'])) + document.add_value(_VALUE_TITLE, properties.get('title', '').strip()) + + term_generator = xapian.TermGenerator() + + # TODO: we should do stemming, but in which language? 
+ #if language is not None: + # term_generator.set_stemmer(_xapian.Stem(language)) + + # TODO: we should use a stopper + #if stop is not None: + # stopper = _xapian.SimpleStopper() + # for term in stop: + # stopper.add (term) + # term_generator.set_stopper (stopper) + + term_generator.set_document(document) + term_generator.index_text_without_positions( + self._extract_text(properties), 1, '') if not self.contains(uid): self._database.add_document(document) else: self._database.replace_document(_PREFIX_UID + uid, document) - self._flush() + def _extract_text(self, properties): + text = '' + for key, value in properties.items(): + if key not in _PROPERTIES_NOT_TO_INDEX: + if text: + text += ' ' + if isinstance(value, unicode): + value = value.encode('utf-8') + elif not isinstance(value, basestring): + value = str(value) + text += value + return text + def find(self, query): offset = query.pop('offset', 0) limit = query.pop('limit', MAX_QUERY_LIMIT) order_by = query.pop('order_by', []) - query_string = query.pop('query', None) enquire = Enquire(self._database) - enquire.set_query(self._query_parser.parse_query(query, query_string)) + enquire.set_query(self._parse_query(query)) # This will assure that the results count is exact. 
check_at_least = offset + limit + 1 @@ -278,7 +151,7 @@ class IndexStore(object): elif order_by == '-title': enquire.set_sort_by_value(_VALUE_TITLE, False) else: - logging.warning('Unsupported property for sorting: %s', order_by) + logging.warning('Unsupported property for sorting: %s' % order_by) query_result = enquire.get_mset(offset, limit, check_at_least) total_count = query_result.get_matches_estimated() @@ -289,6 +162,70 @@ class IndexStore(object): return (uids, total_count) + def _parse_query(self, query_dict): + logging.debug('_parse_query %r' % query_dict) + queries = [] + + query_str = query_dict.pop('query', None) + if query_str is not None: + query_parser = QueryParser() + query_parser.set_database(self._database) + #query_parser.set_default_op(Query.OP_AND) + + # TODO: we should do stemming, but in which language? + #query_parser.set_stemmer(_xapian.Stem(lang)) + #query_parser.set_stemming_strategy(qp.STEM_SOME) + + query = query_parser.parse_query( + query_str, + QueryParser.FLAG_PHRASE | + QueryParser.FLAG_BOOLEAN | + QueryParser.FLAG_LOVEHATE | + QueryParser.FLAG_WILDCARD, + '') + + queries.append(query) + + timestamp = query_dict.pop('timestamp', None) + if timestamp is not None: + start = str(timestamp.pop('start', 0)) + end = str(timestamp.pop('end', _MAX_RESULTS)) + query = Query(Query.OP_VALUE_RANGE, _VALUE_TIMESTAMP, start, end) + queries.append(query) + + uid = query_dict.pop('uid', None) + if uid is not None: + queries.append(Query(_PREFIX_UID + uid)) + + activity = query_dict.pop('activity', None) + if activity is not None: + queries.append(Query(_PREFIX_ACTIVITY + activity)) + + activity_id = query_dict.pop('activity_id', None) + if activity_id is not None: + query = Query(_PREFIX_ACTIVITY_ID + activity_id) + queries.append(query) + + keep = query_dict.pop('keep', None) + if keep is not None: + query = Query(_PREFIX_KEEP + str(keep)) + queries.append(query) + + mime_type = query_dict.pop('mime_type', None) + if mime_type is not None: + 
mime_queries = [] + for mime_type in mime_type: + mime_queries.append(Query(_PREFIX_MIME_TYPE + mime_type)) + queries.append(Query(Query.OP_OR, mime_queries)) + + if not queries: + queries.append(Query('')) + + if query_dict: + logging.warning('Unknown term(s): %r' % query_dict) + + return Query(Query.OP_AND, queries) + def delete(self, uid): self._database.delete_document(_PREFIX_UID + uid) |