diff options
author | Sascha Silbe <sascha@silbe.org> | 2009-08-06 13:40:32 (GMT) |
---|---|---|
committer | Sascha Silbe <sascha@silbe.org> | 2009-08-13 22:36:16 (GMT) |
commit | dbe49d154adc61f05db68bb43173dd4d77c82690 (patch) | |
tree | 8b0ca46596a9b526a5f4846230cf087c275aaa06 | |
parent | d7ab281cabeab4fe3529ca1fa14b8a15895b9d36 (diff) |
revised patch for IndexStore refactoring and prefix term support
-rw-r--r-- | src/carquinyol/datastore.py | 3 | ||||
-rw-r--r-- | src/carquinyol/indexstore.py | 298 |
2 files changed, 194 insertions, 107 deletions
diff --git a/src/carquinyol/datastore.py b/src/carquinyol/datastore.py index 41b16b5..729f4a9 100644 --- a/src/carquinyol/datastore.py +++ b/src/carquinyol/datastore.py @@ -16,6 +16,7 @@ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA import logging +import locale import uuid import time import os @@ -50,6 +51,8 @@ class DataStore(dbus.service.Object): """ def __init__(self, **options): + # needed for locale-specific date parsing + locale.setlocale(locale.LC_ALL, '') bus_name = dbus.service.BusName(DS_SERVICE, bus=dbus.SessionBus(), replace_existing=False, diff --git a/src/carquinyol/indexstore.py b/src/carquinyol/indexstore.py index 42c3132..25c1036 100644 --- a/src/carquinyol/indexstore.py +++ b/src/carquinyol/indexstore.py @@ -16,10 +16,12 @@ import logging import os +import sys +import time import gobject import xapian -from xapian import WritableDatabase, Document, Enquire, Query, QueryParser +from xapian import WritableDatabase, Document, Enquire, Query from carquinyol import layoutmanager from carquinyol.layoutmanager import MAX_QUERY_LIMIT @@ -40,10 +42,186 @@ _FLUSH_THRESHOLD = 20 # Force a flush after _n_ seconds since the last change to the db _FLUSH_TIMEOUT = 60 -_PROPERTIES_NOT_TO_INDEX = ['timestamp', 'activity_id', 'keep', 'preview'] +_PROPERTIES_NOT_TO_INDEX = ['timestamp', 'preview'] _MAX_RESULTS = int(2 ** 31 - 1) +_query_term_map = { + 'uid': _PREFIX_UID, + 'activity': _PREFIX_ACTIVITY, + 'activity_id': _PREFIX_ACTIVITY_ID, + 'mime_type': _PREFIX_MIME_TYPE, + 'keep': _PREFIX_KEEP, +} + +_query_value_map = { + 'timestamp': _VALUE_TIMESTAMP, +} + + +class DateRangeProcessor (xapian.ValueRangeProcessor): + """ + Xapian ValueRangeProcessor for dates given in locale-specific format. + Returns Unix timestamps as values. + """ + + def __init__(self, value_no): + self._value_no = value_no + xapian.ValueRangeProcessor.__init__(self) + + def __call__(self, begin, end): + try: + # TODO: more flexible parsing - e.g. two-digit vs. four-digit + # year numbers + return (self._value_no, + str(time.mktime(time.strptime(begin, "%x"))), + str(time.mktime(time.strptime(end, "%x")))) + except ValueError: + return (xapian.BAD_VALUENO, begin, end) + + +class TermGenerator (xapian.TermGenerator): + + def index_document(self, document, properties): + document.add_value(_VALUE_TIMESTAMP, str(properties['timestamp'])) + document.add_value(_VALUE_TITLE, properties.get('title', '').strip()) + + xapian.TermGenerator.set_document(self, document) + + properties = dict(properties) + self._index_known(document, properties) + self._index_unknown(document, properties) + + def _index_known(self, document, properties): + for (name, prefix) in _query_term_map.items(): + if (name not in properties): + continue + + self._index_property(document, name, properties.pop(name), prefix) + + def _index_unknown(self, document, properties): + for (name, value) in properties.items(): + self._index_property(document, name, value) + + def _index_property(self, doc, name, value, prefix=''): + if name in _PROPERTIES_NOT_TO_INDEX or not value: + return + + if isinstance(value, unicode): + value = value.encode('utf-8') + elif not isinstance(value, basestring): + value = str(value) + + # add full value for dictionary-based searches + # TODO: change query parser to generate phrase query instead + doc.add_term(prefix+value) + + # index with and without prefix so specifying a prefix is optional + # in query strings + if prefix: + self.index_text(value, 1, prefix) + + self.index_text(value) + self.increase_termpos() + + +class QueryParser (xapian.QueryParser): + """QueryParser that understands dictionaries and Xapian query strings. + + The dictionary contains metadata names as keys and either basic types + (exact match), 2-tuples (range, only valid for value-stored metadata) + or a list (multiple exact matches joined with OR) as values. + An empty dictionary matches everything. Queries from different keys + (i.e. different metadata names) are joined with AND. + """ + + def __init__(self): + xapian.QueryParser.__init__(self) + + for (name, prefix) in _query_term_map.items(): + self.add_prefix(name, prefix) + + self.add_valuerangeprocessor(DateRangeProcessor(_VALUE_TIMESTAMP)) + + def _parse_query_term(self, m_name, prefix, m_value): + if isinstance(m_value, list): + return Query(Query.OP_OR, [ + self._parse_query_term(m_name, prefix, word) + for word in m_value]) + + else: + return Query(prefix+str(m_value)) + + def _parse_query_value_range(self, name, value, value_no): + if len(value) != 2: + raise TypeError( + "Only tuples of size 2 have a defined meaning. " \ + "Did you mean to pass a list instead?") + + start, end = value + return Query(Query.OP_VALUE_RANGE, + value_no, str(start), str(end)) + + def _parse_query_value(self, name, value_no, value): + if isinstance(value, list): + return Query(Query.OP_OR, [ + self._parse_query_value(name, value_no, word) + for word in value]) + + elif isinstance(value, tuple): + return self._parse_query_value_range(name, value, value_no) + + elif isinstance(value, dict): + # compatibility option for timestamp: {'start': 0, 'end': 1} + start = value.get('start', 0) + end = value.get('end', sys.maxint) + return self._parse_query_value_range(name, (start, end), value_no) + + else: + return Query(Query.OP_VALUE_RANGE, + _query_value_map[name], str(value), str(value)) + + def _parse_query_xapian(self, query_str): + try: + return xapian.QueryParser.parse_query( + self, query_str, + QueryParser.FLAG_PHRASE | + QueryParser.FLAG_BOOLEAN | + QueryParser.FLAG_LOVEHATE | + QueryParser.FLAG_WILDCARD, + '') + + except xapian.QueryParserError, exception: + logging.warning("Invalid query string: "+exception.get_msg()) + return Query() + + def parse_query(self, query_dict, query_string): + logging.debug('parse_query %r %r', query_dict, query_string) + queries = [] + query_dict = dict(query_dict) + + if query_string: + queries.append(self._parse_query_xapian(str(query_string))) + + queries += [ + self._parse_query_term(name, prefix, query_dict.pop(name)) + for (name, prefix) in _query_term_map.items() + if name in query_dict] + + queries += [ + self._parse_query_value(name, value_no, query_dict.pop(name)) + for (name, value_no) in _query_value_map.items() + if name in query_dict] + + if not queries: + queries.append(Query('')) + + if query_dict: + logging.warning('Unknown term(s): %r', query_dict) + + logging.debug("queries: %r", [str(q) for q in queries]) + return Query(Query.OP_AND, queries) + class IndexStore(object): """Index metadata and provide rich query facilities on it. @@ -53,11 +231,17 @@ class IndexStore(object): self._database = None self._flush_timeout = None self._pending_writes = 0 + self._query_parser = None + self._term_generator = None def open_index(self): index_path = layoutmanager.get_instance().get_index_path() self._database = WritableDatabase(index_path, xapian.DB_CREATE_OR_OPEN) + self._query_parser = QueryParser() + self._query_parser.set_database(self._database) + self._term_generator = TermGenerator() + def close_index(self): self._database.flush() self._database = None @@ -79,60 +263,24 @@ class IndexStore(object): def store(self, uid, properties): document = Document() - document.add_term(_PREFIX_UID + uid) - document.add_term(_PREFIX_ACTIVITY + properties.get('activity', '')) - document.add_term(_PREFIX_MIME_TYPE + properties.get('mime_type', '')) - document.add_term(_PREFIX_ACTIVITY_ID + - properties.get('activity_id', '')) - document.add_term(_PREFIX_KEEP + str(properties.get('keep', 0))) - document.add_value(_VALUE_UID, uid) - document.add_value(_VALUE_TIMESTAMP, str(properties['timestamp'])) - document.add_value(_VALUE_TITLE, properties.get('title', '').strip()) - - term_generator = xapian.TermGenerator() - - # TODO: we should do stemming, but in which language? - #if language is not None: - # term_generator.set_stemmer(_xapian.Stem(language)) - - # TODO: we should use a stopper - #if stop is not None: - # stopper = _xapian.SimpleStopper() - # for term in stop: - # stopper.add (term) - # term_generator.set_stopper (stopper) - - term_generator.set_document(document) - term_generator.index_text_without_positions( - self._extract_text(properties), 1, '') + self._term_generator.index_document(document, properties) if not self.contains(uid): self._database.add_document(document) else: self._database.replace_document(_PREFIX_UID + uid, document) - self._flush() - def _extract_text(self, properties): - text = '' - for key, value in properties.items(): - if key not in _PROPERTIES_NOT_TO_INDEX: - if text: - text += ' ' - if isinstance(value, unicode): - value = value.encode('utf-8') - elif not isinstance(value, basestring): - value = str(value) - text += value - return text + self._flush() def find(self, query): offset = query.pop('offset', 0) limit = query.pop('limit', MAX_QUERY_LIMIT) order_by = query.pop('order_by', []) + query_string = query.pop('query', None) enquire = Enquire(self._database) - enquire.set_query(self._parse_query(query)) + enquire.set_query(self._query_parser.parse_query(query, query_string)) # This will assure that the results count is exact. check_at_least = offset + limit + 1 @@ -151,7 +299,7 @@ class IndexStore(object): elif order_by == '-title': enquire.set_sort_by_value(_VALUE_TITLE, False) else: - logging.warning('Unsupported property for sorting: %s' % order_by) + logging.warning('Unsupported property for sorting: %s', order_by) query_result = enquire.get_mset(offset, limit, check_at_least) total_count = query_result.get_matches_estimated() @@ -162,70 +310,6 @@ class IndexStore(object): return (uids, total_count) - def _parse_query(self, query_dict): - logging.debug('_parse_query %r' % query_dict) - queries = [] - - query_str = query_dict.pop('query', None) - if query_str is not None: - query_parser = QueryParser() - query_parser.set_database(self._database) - #query_parser.set_default_op(Query.OP_AND) - - # TODO: we should do stemming, but in which language? - #query_parser.set_stemmer(_xapian.Stem(lang)) - #query_parser.set_stemming_strategy(qp.STEM_SOME) - - query = query_parser.parse_query( - query_str, - QueryParser.FLAG_PHRASE | - QueryParser.FLAG_BOOLEAN | - QueryParser.FLAG_LOVEHATE | - QueryParser.FLAG_WILDCARD, - '') - - queries.append(query) - - timestamp = query_dict.pop('timestamp', None) - if timestamp is not None: - start = str(timestamp.pop('start', 0)) - end = str(timestamp.pop('end', _MAX_RESULTS)) - query = Query(Query.OP_VALUE_RANGE, _VALUE_TIMESTAMP, start, end) - queries.append(query) - - uid = query_dict.pop('uid', None) - if uid is not None: - queries.append(Query(_PREFIX_UID + uid)) - - activity = query_dict.pop('activity', None) - if activity is not None: - queries.append(Query(_PREFIX_ACTIVITY + activity)) - - activity_id = query_dict.pop('activity_id', None) - if activity_id is not None: - query = Query(_PREFIX_ACTIVITY_ID + activity_id) - queries.append(query) - - keep = query_dict.pop('keep', None) - if keep is not None: - query = Query(_PREFIX_KEEP + str(keep)) - queries.append(query) - - mime_type = query_dict.pop('mime_type', None) - if mime_type is not None: - mime_queries = [] - for mime_type in mime_type: - mime_queries.append(Query(_PREFIX_MIME_TYPE + mime_type)) - queries.append(Query(Query.OP_OR, mime_queries)) - - if not queries: - queries.append(Query('')) - - if query_dict: - logging.warning('Unknown term(s): %r' % query_dict) - - return Query(Query.OP_AND, queries) - def delete(self, uid): self._database.delete_document(_PREFIX_UID + uid) |