Web   ·   Wiki   ·   Activities   ·   Blog   ·   Lists   ·   Chat   ·   Meeting   ·   Bugs   ·   Git   ·   Translate   ·   Archive   ·   People   ·   Donate
summary · refs · log · tree · commit · diff · stats
diff options
context:
space:
mode:
author Sascha Silbe <sascha@silbe.org> 2009-08-06 13:40:32 (GMT)
committer Sascha Silbe <sascha@silbe.org> 2009-08-13 22:36:16 (GMT)
commit dbe49d154adc61f05db68bb43173dd4d77c82690 (patch)
tree 8b0ca46596a9b526a5f4846230cf087c275aaa06
parent d7ab281cabeab4fe3529ca1fa14b8a15895b9d36 (diff)
revised patch for IndexStore refactoring and prefix term support
-rw-r--r-- src/carquinyol/datastore.py 3
-rw-r--r-- src/carquinyol/indexstore.py 298
2 files changed, 194 insertions, 107 deletions
diff --git a/src/carquinyol/datastore.py b/src/carquinyol/datastore.py
index 41b16b5..729f4a9 100644
--- a/src/carquinyol/datastore.py
+++ b/src/carquinyol/datastore.py
@@ -16,6 +16,7 @@
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
import logging
+import locale
import uuid
import time
import os
@@ -50,6 +51,8 @@ class DataStore(dbus.service.Object):
"""
def __init__(self, **options):
+ # needed for locale-specific date parsing
+ locale.setlocale(locale.LC_ALL, '')
bus_name = dbus.service.BusName(DS_SERVICE,
bus=dbus.SessionBus(),
replace_existing=False,
diff --git a/src/carquinyol/indexstore.py b/src/carquinyol/indexstore.py
index 42c3132..25c1036 100644
--- a/src/carquinyol/indexstore.py
+++ b/src/carquinyol/indexstore.py
@@ -16,10 +16,12 @@
import logging
import os
+import sys
+import time
import gobject
import xapian
-from xapian import WritableDatabase, Document, Enquire, Query, QueryParser
+from xapian import WritableDatabase, Document, Enquire, Query
from carquinyol import layoutmanager
from carquinyol.layoutmanager import MAX_QUERY_LIMIT
@@ -40,10 +42,186 @@ _FLUSH_THRESHOLD = 20
# Force a flush after _n_ seconds since the last change to the db
_FLUSH_TIMEOUT = 60
-_PROPERTIES_NOT_TO_INDEX = ['timestamp', 'activity_id', 'keep', 'preview']
+_PROPERTIES_NOT_TO_INDEX = ['timestamp', 'preview']
_MAX_RESULTS = int(2 ** 31 - 1)
+_query_term_map = {
+ 'uid': _PREFIX_UID,
+ 'activity': _PREFIX_ACTIVITY,
+ 'activity_id': _PREFIX_ACTIVITY_ID,
+ 'mime_type': _PREFIX_MIME_TYPE,
+ 'keep': _PREFIX_KEEP,
+}
+
+_query_value_map = {
+ 'timestamp': _VALUE_TIMESTAMP,
+}
+
+
+class DateRangeProcessor (xapian.ValueRangeProcessor):
+    """Value range processor for dates written in the locale's format.
+
+    Each end of the range is parsed with the "%x" strptime format and
+    handed back as the string form of the matching time.mktime() result,
+    together with the value slot number given to the constructor.
+    """
+
+    def __init__(self, value_no):
+        # value slot the converted range applies to
+        self._value_no = value_no
+        xapian.ValueRangeProcessor.__init__(self)
+
+    def __call__(self, begin, end):
+        """Convert the (begin, end) date strings to timestamp strings.
+
+        Returns (value_no, begin, end) with both ends converted, or
+        (xapian.BAD_VALUENO, begin, end) with the strings untouched if
+        either end cannot be parsed as a date.
+        """
+        # TODO: more flexible parsing - e.g. two-digit vs. four-digit
+        # year numbers
+        try:
+            bounds = [str(time.mktime(time.strptime(text, "%x")))
+                      for text in (begin, end)]
+        except ValueError:
+            # at least one end is not a date in the current locale
+            return (xapian.BAD_VALUENO, begin, end)
+
+        return (self._value_no, bounds[0], bounds[1])
+
+
+class TermGenerator (xapian.TermGenerator):
+    """Term generator that indexes a metadata dictionary into a document.
+
+    Properties listed in _query_term_map are indexed with their prefix
+    (and again without it); all other properties are indexed without a
+    prefix. Properties named in _PROPERTIES_NOT_TO_INDEX are skipped.
+    """
+
+    def index_document(self, document, properties):
+        """Add the values and terms derived from `properties` to `document`.
+
+        Raises KeyError if `properties` has no 'timestamp' entry.
+        """
+        document.add_value(_VALUE_TIMESTAMP, str(properties['timestamp']))
+        document.add_value(_VALUE_TITLE, properties.get('title', '').strip())
+
+        xapian.TermGenerator.set_document(self, document)
+
+        # work on a copy: _index_known() pops the entries it handles so
+        # that _index_unknown() only sees the remaining ones
+        properties = dict(properties)
+        self._index_known(document, properties)
+        self._index_unknown(document, properties)
+
+    def _index_known(self, document, properties):
+        """Index (and remove from `properties`) all prefixed properties."""
+        for (name, prefix) in _query_term_map.items():
+            if name not in properties:
+                continue
+
+            self._index_property(document, name, properties.pop(name),
+                                 prefix)
+
+    def _index_unknown(self, document, properties):
+        """Index every remaining property without a prefix."""
+        for (name, value) in properties.items():
+            self._index_property(document, name, value)
+
+    def _index_property(self, doc, name, value, prefix=''):
+        """Index a single property value, optionally under `prefix`.
+
+        Empty values (None, '', empty containers) are skipped. Numeric
+        values are always indexed, including zero, so that e.g. keep=0
+        still gets its prefixed term and can be found by an exact-match
+        query.
+        """
+        if name in _PROPERTIES_NOT_TO_INDEX:
+            return
+
+        # a plain "not value" test would also drop a numeric 0
+        is_number = isinstance(value, (int, long, float))
+        if not value and not is_number:
+            return
+
+        if isinstance(value, unicode):
+            value = value.encode('utf-8')
+        elif not isinstance(value, basestring):
+            value = str(value)
+
+        # add full value for dictionary-based searches
+        # TODO: change query parser to generate phrase query instead
+        # NOTE(review): the whole value becomes one term; Xapian backends
+        # limit term length, so very long values may be rejected - confirm
+        doc.add_term(prefix + value)
+
+        # index with and without prefix so specifying a prefix is optional
+        # in query strings
+        if prefix:
+            self.index_text(value, 1, prefix)
+
+        self.index_text(value)
+        self.increase_termpos()
+
+
+class QueryParser (xapian.QueryParser):
+    """QueryParser that understands dictionaries and Xapian query strings.
+
+    The dictionary contains metadata names as keys and either basic types
+    (exact match), 2-tuples (range, only valid for value-stored metadata)
+    or a list (multiple exact matches joined with OR) as values.
+    An empty dictionary matches everything. Queries from different keys
+    (i.e. different metadata names) are joined with AND.
+    """
+
+    def __init__(self):
+        xapian.QueryParser.__init__(self)
+
+        # let query strings use the same names as dictionary queries
+        for (name, prefix) in _query_term_map.items():
+            self.add_prefix(name, prefix)
+
+        self.add_valuerangeprocessor(DateRangeProcessor(_VALUE_TIMESTAMP))
+
+    def _to_str(self, value):
+        """Return `value` as a byte string, encoding unicode as UTF-8.
+
+        A bare str() raises UnicodeEncodeError for non-ASCII unicode
+        input; UTF-8 matches the encoding TermGenerator uses when
+        indexing.
+        """
+        if isinstance(value, unicode):
+            return value.encode('utf-8')
+
+        return str(value)
+
+    def _parse_query_term(self, m_name, prefix, m_value):
+        """Build an exact-match query; lists are OR-ed element-wise."""
+        if isinstance(m_value, list):
+            return Query(Query.OP_OR, [
+                self._parse_query_term(m_name, prefix, word)
+                for word in m_value])
+
+        return Query(prefix + self._to_str(m_value))
+
+    def _parse_query_value_range(self, name, value, value_no):
+        """Build a value range query from a (start, end) pair.
+
+        Raises TypeError if `value` does not have exactly two elements.
+        """
+        if len(value) != 2:
+            raise TypeError(
+                "Only tuples of size 2 have a defined meaning. "
+                "Did you mean to pass a list instead?")
+
+        start, end = value
+        return Query(Query.OP_VALUE_RANGE, value_no,
+                     self._to_str(start), self._to_str(end))
+
+    def _parse_query_value(self, name, value_no, value):
+        """Build a query on the value slot `value_no`.
+
+        Lists are OR-ed element-wise, tuples and {'start', 'end'}
+        dictionaries become ranges, anything else is an exact match
+        (expressed as a range whose ends are equal).
+        """
+        if isinstance(value, list):
+            return Query(Query.OP_OR, [
+                self._parse_query_value(name, value_no, word)
+                for word in value])
+
+        if isinstance(value, tuple):
+            return self._parse_query_value_range(name, value, value_no)
+
+        if isinstance(value, dict):
+            # compatibility option for timestamp: {'start': 0, 'end': 1}
+            start = value.get('start', 0)
+            end = value.get('end', sys.maxint)
+            return self._parse_query_value_range(name, (start, end),
+                                                 value_no)
+
+        # use the slot we were given instead of looking it up again
+        exact = self._to_str(value)
+        return Query(Query.OP_VALUE_RANGE, value_no, exact, exact)
+
+    def _parse_query_xapian(self, query_str):
+        """Parse a Xapian query string; invalid input yields an empty Query."""
+        try:
+            return xapian.QueryParser.parse_query(
+                self, query_str,
+                QueryParser.FLAG_PHRASE |
+                QueryParser.FLAG_BOOLEAN |
+                QueryParser.FLAG_LOVEHATE |
+                QueryParser.FLAG_WILDCARD,
+                '')
+
+        except xapian.QueryParserError, exception:
+            logging.warning("Invalid query string: %s", exception.get_msg())
+            return Query()
+
+    def parse_query(self, query_dict, query_string):
+        """Combine a dictionary query and a query string into one Query.
+
+        `query_dict` is not modified. Keys that are neither in
+        _query_term_map nor in _query_value_map are ignored with a
+        warning. All partial queries are joined with AND; if there are
+        none, a Query('') is used so that everything matches.
+        """
+        logging.debug('parse_query %r %r', query_dict, query_string)
+        queries = []
+        # copy so that popping handled keys does not affect the caller
+        query_dict = dict(query_dict)
+
+        if query_string:
+            queries.append(
+                self._parse_query_xapian(self._to_str(query_string)))
+
+        queries += [
+            self._parse_query_term(name, prefix, query_dict.pop(name))
+            for (name, prefix) in _query_term_map.items()
+            if name in query_dict]
+
+        queries += [
+            self._parse_query_value(name, value_no, query_dict.pop(name))
+            for (name, value_no) in _query_value_map.items()
+            if name in query_dict]
+
+        if not queries:
+            queries.append(Query(''))
+
+        if query_dict:
+            logging.warning('Unknown term(s): %r', query_dict)
+
+        logging.debug("queries: %r", [str(q) for q in queries])
+        return Query(Query.OP_AND, queries)
+
class IndexStore(object):
"""Index metadata and provide rich query facilities on it.
@@ -53,11 +231,17 @@ class IndexStore(object):
self._database = None
self._flush_timeout = None
self._pending_writes = 0
+ self._query_parser = None
+ self._term_generator = None
def open_index(self):
index_path = layoutmanager.get_instance().get_index_path()
self._database = WritableDatabase(index_path, xapian.DB_CREATE_OR_OPEN)
+ self._query_parser = QueryParser()
+ self._query_parser.set_database(self._database)
+ self._term_generator = TermGenerator()
+
def close_index(self):
self._database.flush()
self._database = None
@@ -79,60 +263,24 @@ class IndexStore(object):
def store(self, uid, properties):
document = Document()
- document.add_term(_PREFIX_UID + uid)
- document.add_term(_PREFIX_ACTIVITY + properties.get('activity', ''))
- document.add_term(_PREFIX_MIME_TYPE + properties.get('mime_type', ''))
- document.add_term(_PREFIX_ACTIVITY_ID +
- properties.get('activity_id', ''))
- document.add_term(_PREFIX_KEEP + str(properties.get('keep', 0)))
-
document.add_value(_VALUE_UID, uid)
- document.add_value(_VALUE_TIMESTAMP, str(properties['timestamp']))
- document.add_value(_VALUE_TITLE, properties.get('title', '').strip())
-
- term_generator = xapian.TermGenerator()
-
- # TODO: we should do stemming, but in which language?
- #if language is not None:
- # term_generator.set_stemmer(_xapian.Stem(language))
-
- # TODO: we should use a stopper
- #if stop is not None:
- # stopper = _xapian.SimpleStopper()
- # for term in stop:
- # stopper.add (term)
- # term_generator.set_stopper (stopper)
-
- term_generator.set_document(document)
- term_generator.index_text_without_positions(
- self._extract_text(properties), 1, '')
+ self._term_generator.index_document(document, properties)
if not self.contains(uid):
self._database.add_document(document)
else:
self._database.replace_document(_PREFIX_UID + uid, document)
- self._flush()
- def _extract_text(self, properties):
- text = ''
- for key, value in properties.items():
- if key not in _PROPERTIES_NOT_TO_INDEX:
- if text:
- text += ' '
- if isinstance(value, unicode):
- value = value.encode('utf-8')
- elif not isinstance(value, basestring):
- value = str(value)
- text += value
- return text
+ self._flush()
def find(self, query):
offset = query.pop('offset', 0)
limit = query.pop('limit', MAX_QUERY_LIMIT)
order_by = query.pop('order_by', [])
+ query_string = query.pop('query', None)
enquire = Enquire(self._database)
- enquire.set_query(self._parse_query(query))
+ enquire.set_query(self._query_parser.parse_query(query, query_string))
# This will assure that the results count is exact.
check_at_least = offset + limit + 1
@@ -151,7 +299,7 @@ class IndexStore(object):
elif order_by == '-title':
enquire.set_sort_by_value(_VALUE_TITLE, False)
else:
- logging.warning('Unsupported property for sorting: %s' % order_by)
+ logging.warning('Unsupported property for sorting: %s', order_by)
query_result = enquire.get_mset(offset, limit, check_at_least)
total_count = query_result.get_matches_estimated()
@@ -162,70 +310,6 @@ class IndexStore(object):
return (uids, total_count)
- def _parse_query(self, query_dict):
- logging.debug('_parse_query %r' % query_dict)
- queries = []
-
- query_str = query_dict.pop('query', None)
- if query_str is not None:
- query_parser = QueryParser()
- query_parser.set_database(self._database)
- #query_parser.set_default_op(Query.OP_AND)
-
- # TODO: we should do stemming, but in which language?
- #query_parser.set_stemmer(_xapian.Stem(lang))
- #query_parser.set_stemming_strategy(qp.STEM_SOME)
-
- query = query_parser.parse_query(
- query_str,
- QueryParser.FLAG_PHRASE |
- QueryParser.FLAG_BOOLEAN |
- QueryParser.FLAG_LOVEHATE |
- QueryParser.FLAG_WILDCARD,
- '')
-
- queries.append(query)
-
- timestamp = query_dict.pop('timestamp', None)
- if timestamp is not None:
- start = str(timestamp.pop('start', 0))
- end = str(timestamp.pop('end', _MAX_RESULTS))
- query = Query(Query.OP_VALUE_RANGE, _VALUE_TIMESTAMP, start, end)
- queries.append(query)
-
- uid = query_dict.pop('uid', None)
- if uid is not None:
- queries.append(Query(_PREFIX_UID + uid))
-
- activity = query_dict.pop('activity', None)
- if activity is not None:
- queries.append(Query(_PREFIX_ACTIVITY + activity))
-
- activity_id = query_dict.pop('activity_id', None)
- if activity_id is not None:
- query = Query(_PREFIX_ACTIVITY_ID + activity_id)
- queries.append(query)
-
- keep = query_dict.pop('keep', None)
- if keep is not None:
- query = Query(_PREFIX_KEEP + str(keep))
- queries.append(query)
-
- mime_type = query_dict.pop('mime_type', None)
- if mime_type is not None:
- mime_queries = []
- for mime_type in mime_type:
- mime_queries.append(Query(_PREFIX_MIME_TYPE + mime_type))
- queries.append(Query(Query.OP_OR, mime_queries))
-
- if not queries:
- queries.append(Query(''))
-
- if query_dict:
- logging.warning('Unknown term(s): %r' % query_dict)
-
- return Query(Query.OP_AND, queries)
-
def delete(self, uid):
self._database.delete_document(_PREFIX_UID + uid)