diff options
author | Marco Pesenti Gritti <marco@localhost.localdomain> | 2007-07-11 19:37:48 (GMT) |
---|---|---|
committer | Marco Pesenti Gritti <marco@localhost.localdomain> | 2007-07-11 19:37:48 (GMT) |
commit | 3a3a2c361fbf670ee5375e669d34be386f6924f8 (patch) | |
tree | b436ec29c47fdb983e8355c31768a1e3d2b10a6c | |
parent | cb8a3f7e34b07a4d3fb3ebb3cb7eddceaec0e73d (diff) |
Add secore. Cut and paste from http://flaxcode.googlecode.com/svn/trunk/libs/secore/secore/.
-rw-r--r-- | Makefile.am | 2 | ||||
-rw-r--r-- | configure.ac | 1 | ||||
-rw-r--r-- | secore/Makefile.am | 12 | ||||
-rw-r--r-- | secore/__init__.py | 30 | ||||
-rw-r--r-- | secore/datastructures.py | 216 | ||||
-rw-r--r-- | secore/errors.py | 40 | ||||
-rw-r--r-- | secore/fieldactions.py | 358 | ||||
-rw-r--r-- | secore/fieldmappings.py | 123 | ||||
-rw-r--r-- | secore/highlight.py | 310 | ||||
-rw-r--r-- | secore/indexerconnection.py | 380 | ||||
-rw-r--r-- | secore/marshall.py | 73 | ||||
-rw-r--r-- | secore/parsedate.py | 56 | ||||
-rw-r--r-- | secore/searchconnection.py | 618 |
13 files changed, 2218 insertions, 1 deletions
diff --git a/Makefile.am b/Makefile.am index 8060aae..abf71cf 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,4 +1,4 @@ -SUBDIRS = bin etc src +SUBDIRS = bin etc secore src test: @cd tests diff --git a/configure.ac b/configure.ac index 4824635..c60229a 100644 --- a/configure.ac +++ b/configure.ac @@ -12,6 +12,7 @@ AC_OUTPUT([ Makefile bin/Makefile etc/Makefile +secore/Makefile src/Makefile src/olpc/Makefile src/olpc/datastore/Makefile diff --git a/secore/Makefile.am b/secore/Makefile.am new file mode 100644 index 0000000..393ba8f --- /dev/null +++ b/secore/Makefile.am @@ -0,0 +1,12 @@ +datastoredir = $(pythondir)/secore +datastore_PYTHON = \ + __init__.py \ + datastructures.py \ + fieldmappings.py \ + searchconnection.py \ + errors.py \ + highlight.py \ + marshall.py \ + fieldactions.py \ + indexerconnection.py \ + parsedate.py diff --git a/secore/__init__.py b/secore/__init__.py new file mode 100644 index 0000000..157fea4 --- /dev/null +++ b/secore/__init__.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python +# +# Copyright (C) 2007 Lemur Consulting Ltd +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +"""Search engine Core. + +See the accompanying documentation for details. 
In particular, there should be +an accompanying file "introduction.html" (or "introduction.rst") which gives +details of how to use the secore package. + +""" +__docformat__ = "restructuredtext en" + +from datastructures import * +from errors import * +from indexerconnection import * +from searchconnection import * diff --git a/secore/datastructures.py b/secore/datastructures.py new file mode 100644 index 0000000..414625d --- /dev/null +++ b/secore/datastructures.py @@ -0,0 +1,216 @@ +#!/usr/bin/env python +# +# Copyright (C) 2007 Lemur Consulting Ltd +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +r"""datastructures.py: Datastructures for search engine core. + +""" +__docformat__ = "restructuredtext en" + +import xapian as _xapian +import cPickle as _cPickle + +class Field(object): + # Use __slots__ because we're going to have very many Field objects in + # typical usage. + __slots__ = 'name', 'value' + + def __init__(self, name, value): + self.name = name + self.value = value + + def __repr__(self): + return 'Field(%r, %r)' % (self.name, self.value) + +class UnprocessedDocument(object): + """A unprocessed document to be passed to the indexer. + + This represents an item to be processed and stored in the search engine. 
+ Each document will be processed by the indexer to generate a + ProcessedDocument, which can then be stored in the search engine index. + + Note that some information in an UnprocessedDocument will not be + represented in the ProcessedDocument: therefore, it is not possible to + retrieve an UnprocessedDocument from the search engine index. + + An unprocessed document is a simple container with two attributes: + + - `fields` is a list of Field objects. + - `id` is a string holding a unique identifier for the document (or + None to get the database to allocate a unique identifier automatically + when the document is added). + + """ + + __slots__ = 'id', 'fields', + def __init__(self, id=None, fields=None): + self.id = id + if fields is None: + self.fields = [] + else: + self.fields = fields + + def __repr__(self): + return 'UnprocessedDocument(%r, %r)' % (self.id, self.fields) + +class ProcessedDocument(object): + """A processed document, as stored in the index. + + This represents an item which is ready to be stored in the search engine, + or which has been returned by the search engine. + + """ + + __slots__ = '_doc', '_fieldmappings', '_data', + def __init__(self, fieldmappings, xapdoc=None): + """Create a ProcessedDocument. + + `fieldmappings` is the configuration from a database connection used lookup + the configuration to use to store each field. + + If supplied, `xapdoc` is a Xapian document to store in the processed + document. Otherwise, a new Xapian document is created. + + """ + if xapdoc is None: + self._doc = _xapian.Document() + else: + self._doc = xapdoc + self._fieldmappings = fieldmappings + self._data = None + + def add_term(self, field, term, wdfinc=1, positions=None): + """Add a term to the document. + + Terms are the main unit of information used for performing searches. + + - `field` is the field to add the term to. + - `term` is the term to add. + - `wdfinc` is the value to increase the within-document-frequency + measure for the term by. 
+ - `positions` is the positional information to add for the term. + This may be None to indicate that there is no positional information, + or may be an integer to specify one position, or may be a sequence of + integers to specify several positions. (Note that the wdf is not + increased automatically for each position: if you add a term at 7 + positions, and the wdfinc value is 2, the total wdf for the term will + only be increased by 2, not by 14.) + + """ + prefix = self._fieldmappings.get_prefix(field) + if len(term) > 0: + # We use the following check, rather than "isupper()" to ensure + # that we match the check performed by the queryparser, regardless + # of our locale. + if ord(term[0]) >= ord('A') and ord(term[0]) <= ord('Z'): + prefix = prefix + ':' + if positions is None: + self._doc.add_term(prefix + term, wdfinc) + elif isinstance(positions, int): + self._doc.add_posting(prefix + term, positions, wdfinc) + else: + self._doc.add_term(prefix + term, wdfinc) + for pos in positions: + self._doc.add_posting(prefix + term, pos, 0) + + def add_value(self, field, value): + """Add a value to the document. + + Values are additional units of information used when performing + searches. Note that values are _not_ intended to be used to store + information for display in the search results - use the document data + for that. The intention is that as little information as possible is + stored in values, so that they can be accessed as quickly as possible + during the search operation. + + Unlike terms, each document may have at most one value in each field + (whereas there may be an arbitrary number of terms in a given field). + If an attempt to add multiple values to a single field is made, only + the last value added will be stored. + + """ + slot = self._fieldmappings.get_slot(field) + self._doc.add_value(slot, value) + + def get_value(self, field): + """Get a value from the document. 
+ + """ + slot = self._fieldmappings.get_slot(field) + return self._doc.get_value(slot) + + def prepare(self): + """Prepare the document for adding to a xapian database. + + This updates the internal xapian document with any changes which have + been made, and then returns it. + + """ + if self._data is not None: + self._doc.set_data(_cPickle.dumps(self._data, 2)) + self._data = None + return self._doc + + def _get_data(self): + if self._data is None: + rawdata = self._doc.get_data() + if rawdata == '': + self._data = {} + else: + self._data = _cPickle.loads(rawdata) + return self._data + def _set_data(self, data): + if not isinstance(data, dict): + raise TypeError("Cannot set data to any type other than a dict") + self._data = data + data = property(_get_data, _set_data, doc= + """The data stored in this processed document. + + This data is a dictionary of entries, where the key is a fieldname, and the + value is a list of strings. + + """) + + def _get_id(self): + tl = self._doc.termlist() + try: + term = tl.skip_to('Q').term + if len(term) == 0 or term[0] != 'Q': + return None + except StopIteration: + return None + return term[1:] + def _set_id(self, id): + tl = self._doc.termlist() + try: + term = tl.skip_to('Q').term + except StopIteration: + term = '' + if len(term) != 0 and term[0] == 'Q': + self._doc.remove_term(term) + if id is not None: + self._doc.add_term('Q' + id, 0) + id = property(_get_id, _set_id, doc= + """The unique ID for this document. 
+ + """) + + def __repr__(self): + return '<ProcessedDocument(%r)>' % (self.id) + +if __name__ == '__main__': + import doctest, sys + doctest.testmod (sys.modules[__name__]) diff --git a/secore/errors.py b/secore/errors.py new file mode 100644 index 0000000..b6ad00f --- /dev/null +++ b/secore/errors.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python +# +# Copyright (C) 2007 Lemur Consulting Ltd +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +r"""errors.py: Exceptions for the search engine core. + +""" +__docformat__ = "restructuredtext en" + +class SearchEngineError(Exception): + r"""Base class for exceptions thrown by the search engine. + + Any errors generated by the python level interface to xapian will be + instances of this class or its subclasses. + + """ + +class IndexerError(SearchEngineError): + r"""Class used to report errors from the indexing API. + + """ + +class SearchError(SearchEngineError): + r"""Class used to report errors from the search API. 
+ + """ + diff --git a/secore/fieldactions.py b/secore/fieldactions.py new file mode 100644 index 0000000..c595f0b --- /dev/null +++ b/secore/fieldactions.py @@ -0,0 +1,358 @@ +#!/usr/bin/env python +# +# Copyright (C) 2007 Lemur Consulting Ltd +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +r"""fieldactions.py: Definitions and implementations of field actions. + +""" +__docformat__ = "restructuredtext en" + +import errors as _errors +import marshall as _marshall +import xapian as _xapian +import parsedate as _parsedate + +def _act_store_content(fieldname, doc, value, context): + """Perform the STORE_CONTENT action. + + """ + try: + fielddata = doc.data[fieldname] + except KeyError: + fielddata = [] + doc.data[fieldname] = fielddata + fielddata.append(value) + +def _act_index_exact(fieldname, doc, value, context): + """Perform the INDEX_EXACT action. + + """ + doc.add_term(fieldname, value, 0) + +def _act_index_freetext(fieldname, doc, value, context, weight=1, + language=None, stop=None, spell=False, + nopos=False, noprefix=False): + """Perform the INDEX_FREETEXT action. 
+ + """ + termgen = _xapian.TermGenerator() + if language is not None: + termgen.set_stemmer(_xapian.Stem(language)) + + if stop is not None: + stopper = _xapian.SimpleStopper() + for term in stop: + stopper.add (term) + termgen.set_stopper (stopper) + + if spell: + termgen.set_database(context.index) + termgen.set_flags(termgen.FLAG_SPELLING) + + termgen.set_document(doc._doc) + termgen.set_termpos(context.current_position) + if nopos: + termgen.index_text_without_positions(value, weight, '') + else: + termgen.index_text(value, weight, '') + + if not noprefix: + # Store a second copy of the term with a prefix, for field-specific + # searches. + prefix = doc._fieldmappings.get_prefix(fieldname) + if len(prefix) != 0: + termgen.set_termpos(context.current_position) + if nopos: + termgen.index_text_without_positions(value, weight, prefix) + else: + termgen.index_text(value, weight, prefix) + + # Add a gap between each field instance, so that phrase searches don't + # match across instances. + termgen.increase_termpos(10) + context.current_position = termgen.get_termpos() + +class SortableMarshaller(object): + """Implementation of marshalling for sortable values. + + """ + def __init__(self, indexing=True): + if indexing: + self._err = _errors.IndexerError + else: + self._err = _errors.SearchError + + def marshall_string(self, fieldname, value): + """Marshall a value for sorting in lexicograpical order. + + This returns the input as the output, since strings already sort in + lexicographical order. + + """ + return value + + def marshall_float(self, fieldname, value): + """Marshall a value for sorting as a floating point value. 
+ + """ + # convert the value to a float + try: + value = float(value) + except ValueError: + raise self._err("Value supplied to field %r must be a " + "valid floating point number: was %r" % + (fieldname, value)) + return _marshall.float_to_string(value) + + def marshall_date(self, fieldname, value): + """Marshall a value for sorting as a date. + + """ + try: + value = _parsedate.date_from_string(value) + except ValueError, e: + raise self._err("Value supplied to field %r must be a " + "valid date: was %r: error is '%s'" % + (fieldname, value, str(e))) + return _marshall.date_to_string(value) + + def get_marshall_function(self, fieldname, sorttype): + """Get a function used to marshall values of a given sorttype. + + """ + try: + return { + None: self.marshall_string, + 'string': self.marshall_string, + 'float': self.marshall_float, + 'date': self.marshall_date, + }[sorttype] + except KeyError: + raise self._err("Unknown sort type %r for field %r" % + (sorttype, fieldname)) + + +def _act_sort_and_collapse(fieldname, doc, value, context, type=None): + """Perform the SORTABLE action. + + """ + marshaller = SortableMarshaller() + fn = marshaller.get_marshall_function(fieldname, type) + value = fn(fieldname, value) + doc.add_value(fieldname, value) + +class ActionContext(object): + """The context in which an action is performed. + + This is just used to pass term generators, word positions, and the like + around. + + """ + def __init__(self, index): + self.current_language = None + self.current_position = 0 + self.index = index + +class FieldActions(object): + """An object describing the actions to be performed on a field. + + The supported actions are: + + - `STORE_CONTENT`: store the unprocessed content of the field in the search + engine database. All fields which need to be displayed or used when + displaying the search results need to be given this action. + + - `INDEX_EXACT`: index the exact content of the field as a single search + term. 
Fields whose contents need to be searchable as an "exact match" + need to be given this action. + + - `INDEX_FREETEXT`: index the content of this field as text. The content + will be split into terms, allowing free text searching of the field. Four + optional parameters may be supplied: + + - 'weight' is a multiplier to apply to the importance of the field. This + must be an integer, and the default value is 1. + - 'language' is the language to use when processing the field. This can + be expressed as an ISO 2-letter language code. The supported languages + are those supported by the xapian core in use. + - 'stop' is an iterable of stopwords to filter out of the generated + terms. Note that due to Xapian design, only non-positional terms are + affected, so this is of limited use. + - 'spell' is a boolean flag - if true, the contents of the field will be + used for spelling correction. + - 'nopos' is a boolean flag - if true, positional information is not + stored. + - 'noprefix' is a boolean flag - if true, prevents terms with the field + prefix being generated. This means that searches specific to this + field will not work, and thus should only be used for special cases. + + - `SORTABLE`: index the content of the field such that it can be used to + sort result sets. It also allows result sets to be restricted to those + documents with a field values in a given range. One optional parameter + may be supplied: + + - 'type' is a value indicating how to sort the field. It has several + possible values: + + - 'string' - sort in lexicographic (ie, alphabetical) order. + This is the default, used if no type is set. + - 'float' - treat the values as (decimal representations of) floating + point numbers, and sort in numerical order . The values in the field + must be valid floating point numbers (according to Python's float() + function). + - 'date' - sort in date order. 
The values must be valid dates (either + Python datetime.date objects, or ISO 8601 format (ie, YYYYMMDD or + YYYY-MM-DD). + + - `COLLAPSE`: index the content of the field such that it can be used to + "collapse" result sets, such that only the highest result with each value + of the field will be returned. + + """ + + # See the class docstring for the meanings of the following constants. + STORE_CONTENT = 1 + INDEX_EXACT = 2 + INDEX_FREETEXT = 3 + SORTABLE = 4 + COLLAPSE = 5 + + # Sorting and collapsing store the data in a value, but the format depends + # on the sort type. Easiest way to implement is to treat them as the same + # action. + SORT_AND_COLLAPSE = -1 + + # NEED_SLOT is a flag used to indicate that an action needs a slot number + NEED_SLOT = 1 + # NEED_PREFIX is a flag used to indicate that an action needs a prefix + NEED_PREFIX = 2 + + def __init__(self, fieldname): + # Dictionary of actions, keyed by type. + self._actions = {} + self._fieldname = fieldname + + def add(self, field_mappings, action, **kwargs): + """Add an action to perform on a field. + + """ + if action not in (FieldActions.STORE_CONTENT, + FieldActions.INDEX_EXACT, + FieldActions.INDEX_FREETEXT, + FieldActions.SORTABLE, + FieldActions.COLLAPSE,): + raise _errors.IndexerError("Unknown field action: %r" % action) + + info = self._action_info[action] + + # Check parameter names + for key in kwargs.keys(): + if key not in info[1]: + raise _errors.IndexerError("Unknown parameter name for action %r: %r" % (info[0], key)) + + # Fields cannot be indexed both with "EXACT" and "FREETEXT": whilst we + # could implement this, the query parser wouldn't know what to do with + # searches. 
+ if action == FieldActions.INDEX_EXACT: + if FieldActions.INDEX_FREETEXT in self._actions: + raise _errors.IndexerError("Field %r is already marked for indexing " + "as free text: cannot mark for indexing " + "as exact text as well" % self._fieldname) + if action == FieldActions.INDEX_FREETEXT: + if FieldActions.INDEX_EXACT in self._actions: + raise _errors.IndexerError("Field %r is already marked for indexing " + "as exact text: cannot mark for indexing " + "as free text as well" % self._fieldname) + + # Fields cannot be indexed as more than one type for "SORTABLE": to + # implement this, we'd need to use a different prefix for each sortable + # type, but even then the search end wouldn't know what to sort on when + # searching. Also, if they're indexed as "COLLAPSE", the value must be + # stored in the right format for the type "SORTABLE". + if action == FieldActions.SORTABLE or action == FieldActions.COLLAPSE: + if action == FieldActions.COLLAPSE: + sorttype = None + else: + try: + sorttype = kwargs['type'] + except KeyError: + sorttype = 'string' + kwargs['type'] = sorttype + action = FieldActions.SORT_AND_COLLAPSE + + try: + oldsortactions = self._actions[FieldActions.SORT_AND_COLLAPSE] + except KeyError: + oldsortactions = () + + if len(oldsortactions) > 0: + for oldsortaction in oldsortactions: + oldsorttype = oldsortaction['type'] + + if sorttype == oldsorttype or oldsorttype is None: + # Use new type + self._actions[action] = [] + elif sorttype is None: + # Use old type + return + else: + raise _errors.IndexerError("Field %r is already marked for " + "sorting, with a different " + "sort type" % self._fieldname) + + if self.NEED_PREFIX in info[3]: + field_mappings.add_prefix(self._fieldname) + if self.NEED_SLOT in info[3]: + field_mappings.add_slot(self._fieldname) + + # Make an entry for the action + if action not in self._actions: + self._actions[action] = [] + + # Check for repetitions of actions + for old_action in self._actions[action]: + if 
old_action == kwargs: + return + + # Append the action to the list of actions + self._actions[action].append(kwargs) + + def perform(self, doc, value, context): + """Perform the actions on the field. + + - `doc` is a ProcessedDocument to store the result of the actions in. + - `value` is a string holding the value of the field. + - `context` is an ActionContext object used to keep state in. + + """ + for type, actionlist in self._actions.iteritems(): + info = self._action_info[type] + for kwargs in actionlist: + info[2](self._fieldname, doc, value, context, **kwargs) + + _action_info = { + STORE_CONTENT: ('STORE_CONTENT', (), _act_store_content, (), ), + INDEX_EXACT: ('INDEX_EXACT', (), _act_index_exact, (NEED_PREFIX,), ), + INDEX_FREETEXT: ('INDEX_FREETEXT', ('weight', 'language', 'stop', 'spell', 'nopos', 'noprefix', ), + _act_index_freetext, (NEED_PREFIX, ), ), + SORTABLE: ('SORTABLE', ('type', ), None, (NEED_SLOT,), ), + COLLAPSE: ('COLLAPSE', (), None, (NEED_SLOT,), ), + SORT_AND_COLLAPSE: ('SORT_AND_COLLAPSE', ('type', ), _act_sort_and_collapse, (NEED_SLOT,), ), + } + +if __name__ == '__main__': + import doctest, sys + doctest.testmod (sys.modules[__name__]) diff --git a/secore/fieldmappings.py b/secore/fieldmappings.py new file mode 100644 index 0000000..3838ce5 --- /dev/null +++ b/secore/fieldmappings.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python +# +# Copyright (C) 2007 Lemur Consulting Ltd +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +r"""fieldmappings.py: Mappings from field names to term prefixes, etc. + +""" +__docformat__ = "restructuredtext en" + +import cPickle as _cPickle + +class FieldMappings(object): + """Mappings from field names to term prefixes, slot values, etc. + + The following mappings are maintained: + + - a mapping from field name to the string prefix to insert at the start of + terms. + - a mapping from field name to the slot numbers to store the field contents + in. + + """ + __slots__ = '_prefixes', '_prefixcount', '_slots', '_slotcount', + + def __init__(self, serialised=None): + """Create a new field mapping object, or unserialise a saved one. + + """ + if serialised is not None: + (self._prefixes, self._prefixcount, + self._slots, self._slotcount) = _cPickle.loads(serialised) + else: + self._prefixes = {} + self._prefixcount = 0 + self._slots = {} + self._slotcount = 0 + + def _genPrefix(self): + """Generate a previously unused prefix. 
+ + Prefixes are uppercase letters, and start with 'X' (this is a Xapian + convention, for compatibility with other Xapian tools: other starting + letters are reserved for special meanings): + + >>> maps = FieldMappings() + >>> maps._genPrefix() + 'XA' + >>> maps._genPrefix() + 'XB' + >>> [maps._genPrefix() for i in xrange(60)] + ['XC', 'XD', 'XE', 'XF', 'XG', 'XH', 'XI', 'XJ', 'XK', 'XL', 'XM', 'XN', 'XO', 'XP', 'XQ', 'XR', 'XS', 'XT', 'XU', 'XV', 'XW', 'XX', 'XY', 'XZ', 'XAA', 'XBA', 'XCA', 'XDA', 'XEA', 'XFA', 'XGA', 'XHA', 'XIA', 'XJA', 'XKA', 'XLA', 'XMA', 'XNA', 'XOA', 'XPA', 'XQA', 'XRA', 'XSA', 'XTA', 'XUA', 'XVA', 'XWA', 'XXA', 'XYA', 'XZA', 'XAB', 'XBB', 'XCB', 'XDB', 'XEB', 'XFB', 'XGB', 'XHB', 'XIB', 'XJB'] + >>> maps = FieldMappings() + >>> [maps._genPrefix() for i in xrange(27*26 + 5)][-10:] + ['XVZ', 'XWZ', 'XXZ', 'XYZ', 'XZZ', 'XAAA', 'XBAA', 'XCAA', 'XDAA', 'XEAA'] + """ + res = [] + self._prefixcount += 1 + num = self._prefixcount + while num != 0: + ch = (num - 1) % 26 + res.append(chr(ch + ord('A'))) + num -= ch + num = num // 26 + return 'X' + ''.join(res) + + def get_prefix(self, fieldname): + """Get the prefix used for a given field name. + + """ + return self._prefixes[fieldname] + + def get_slot(self, fieldname): + """Get the slot number used for a given field name. + + """ + return self._slots[fieldname] + + def add_prefix(self, fieldname): + """Allocate a prefix for the given field. + + If a prefix is already allocated for this field, this has no effect. + + """ + if fieldname in self._prefixes: + return + self._prefixes[fieldname] = self._genPrefix() + + def add_slot(self, fieldname): + """Allocate a slot number for the given field. + + If a slot number is already allocated for this field, this has no effect. + + """ + if fieldname in self._slots: + return + self._slots[fieldname] = self._slotcount + self._slotcount += 1 + + def serialise(self): + """Serialise the field mappings to a string. 
+ + This can be unserialised by passing the result of this method to the + constructor of a new FieldMappings object. + + """ + return _cPickle.dumps((self._prefixes, + self._prefixcount, + self._slots, + self._slotcount, + ), 2) diff --git a/secore/highlight.py b/secore/highlight.py new file mode 100644 index 0000000..38f2050 --- /dev/null +++ b/secore/highlight.py @@ -0,0 +1,310 @@ +#!/usr/bin/env python +# +# Copyright (C) 2007 Lemur Consulting Ltd +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +r"""highlight.py: Highlight and summarise text. + +""" +__docformat__ = "restructuredtext en" + +import re +import xapian + +class Highlighter(object): + """Class for highlighting text and creating contextual summaries. + + >>> hl = Highlighter("en") + >>> hl.makeSample('Hello world.', ['world']) + 'Hello world.' + >>> hl.highlight('Hello world', ['world'], ('<', '>')) + 'Hello <world>' + + """ + + # split string into words, spaces, punctuation and markup tags + _split_re = re.compile( + '</\\w+>|<\\w+(?:\\s*\\w+="[^"]*"|\\s*\\w+)*\\s*>|[\\w\']+|\\s+|[^\\w\'\\s<>/]+') + + def __init__(self, language_code='en', stemmer=None): + """Create a new highlighter for the specified language. 
+ + """ + if stemmer is not None: + self.stem = stemmer + else: + self.stem = xapian.Stem(language_code) + + def _split_text(self, text, strip_tags=False): + """Split some text into words and non-words. + + - `text` is the text to process. It may be a unicode object or a utf-8 + encoded simple string. + - `strip_tags` is a flag - False to keep tags, True to strip all tags + from the output. + + Returns a list of utf-8 encoded simple strings. + + """ + if isinstance(text, unicode): + text = text.encode('utf-8') + + words = self._split_re.findall(text) + if strip_tags: + return [w for w in words if w[0] != '<'] + else: + return words + + def _strip_prefix(self, term): + """Strip the prefix off a term. + + Prefixes are any initial capital letters, with the exception that R always + ends a prefix, even if followed by capital letters. + + >>> hl = Highlighter("en") + >>> print hl._strip_prefix('hello') + hello + >>> print hl._strip_prefix('Rhello') + hello + >>> print hl._strip_prefix('XARHello') + Hello + >>> print hl._strip_prefix('XAhello') + hello + >>> print hl._strip_prefix('XAh') + h + >>> print hl._strip_prefix('XA') + <BLANKLINE> + + """ + for p in xrange(len(term)): + if term[p].islower(): + return term[p:] + elif term[p] == 'R': + return term[p+1:] + return '' + + def _query_to_stemmed_words(self, query): + """Convert a query to a list of stemmed words. + + - `query` is the query to parse: it may be xapian.Query object, or a + sequence of terms. + + """ + if isinstance(query, xapian.Query): + return [self._strip_prefix(t) for t in query] + else: + return [self.stem(q.lower()) for q in query] + + + def makeSample(self, text, query, maxlen=600, hl=None): + """Make a contextual summary from the supplied text. + + This basically works by splitting the text into phrases, counting the query + terms in each, and keeping those with the most. + + Any markup tags in the text will be stripped. + + `text` is the source text to summarise. 
+ `query` is either a Xapian query object or a list of (unstemmed) term strings. + `maxlen` is the maximum length of the generated summary. + `hl` is a pair of strings to insert around highlighted terms, e.g. ('<b>', '</b>') + + """ + + words = self._split_text(text, True) + terms = self._query_to_stemmed_words(query) + + # build blocks delimited by puncuation, and count matching words in each block + # blocks[n] is a block [firstword, endword, charcount, termcount, selected] + blocks = [] + start = end = count = blockchars = 0 + + while end < len(words): + blockchars += len(words[end]) + if words[end].isalnum(): + if self.stem(words[end].lower()) in terms: + count += 1 + end += 1 + elif words[end] in ',.;:?!\n': + end += 1 + blocks.append([start, end, blockchars, count, False]) + start = end + blockchars = 0 + count = 0 + else: + end += 1 + if start != end: + blocks.append([start, end, blockchars, count, False]) + if len(blocks) == 0: + return '' + + # select high-scoring blocks first, down to zero-scoring + chars = 0 + for count in xrange(3, -1, -1): + for b in blocks: + if b[3] >= count: + b[4] = True + chars += b[2] + if chars >= maxlen: break + if chars >= maxlen: break + + # assemble summary + words2 = [] + lastblock = -1 + for i, b in enumerate(blocks): + if b[4]: + if i != lastblock + 1: + words2.append('..') + words2.extend(words[b[0]:b[1]]) + lastblock = i + + if not blocks[-1][4]: + words2.append('..') + + # trim down to maxlen + l = 0 + for i in xrange (len (words2)): + l += len (words2[i]) + if l >= maxlen: + words2[i:] = ['..'] + break + + if hl is None: + return ''.join(words2) + else: + return self._hl(words2, terms, hl) + + def highlight(self, text, query, hl, strip_tags=False): + """Add highlights (string prefix/postfix) to a string. + + `text` is the source to highlight. + `query` is either a Xapian query object or a list of (unstemmed) term strings. + `hl` is a pair of highlight strings, e.g. 
('<i>', '</i>') + `strip_tags` strips HTML markout iff True + + >>> hl = Highlighter() + >>> qp = xapian.QueryParser() + >>> q = qp.parse_query('cat dog') + >>> tags = ('[[', ']]') + >>> hl.highlight('The cat went Dogging; but was <i>dog tired</i>.', q, tags) + 'The [[cat]] went [[Dogging]]; but was <i>[[dog]] tired</i>.' + + """ + words = self._split_text(text, strip_tags) + terms = self._query_to_stemmed_words(query) + return self._hl(words, terms, hl) + + def _hl(self, words, terms, hl): + """Add highlights to a list of words. + + `words` is the list of words and non-words to be highlighted.. + `terms` is the list of stemmed words to look for. + + """ + for i, w in enumerate(words): + if self.stem(words[i].lower()) in terms: + words[i] = ''.join((hl[0], w, hl[1])) + + return ''.join(words) + + +__test__ = { + 'no_punc': r''' + + Test the highlighter's behaviour when there is no punctuation in the sample + text (regression test - used to return no output): + >>> hl = Highlighter("en") + >>> hl.makeSample('Hello world', ['world']) + 'Hello world' + + ''', + + 'stem_levels': r''' + + Test highlighting of words, and how it works with stemming: + >>> hl = Highlighter("en") + + # "word" and "wording" stem to "word", so the following 4 calls all return + # the same thing + >>> hl.makeSample('Hello. word. wording. wordinging.', ['word'], hl='<>') + 'Hello. <word>. <wording>. wordinging.' + >>> hl.highlight('Hello. word. wording. wordinging.', ['word'], '<>') + 'Hello. <word>. <wording>. wordinging.' + >>> hl.makeSample('Hello. word. wording. wordinging.', ['wording'], hl='<>') + 'Hello. <word>. <wording>. wordinging.' + >>> hl.highlight('Hello. word. wording. wordinging.', ['wording'], '<>') + 'Hello. <word>. <wording>. wordinging.' + + # "wordinging" stems to "wording", so only the last word is highlighted for + # this one. + >>> hl.makeSample('Hello. word. wording. wordinging.', ['wordinging'], hl='<>') + 'Hello. word. wording. <wordinging>.' 
+ >>> hl.highlight('Hello. word. wording. wordinging.', ['wordinging'], '<>') + 'Hello. word. wording. <wordinging>.' + ''', + + 'supplied_stemmer': r''' + + Test behaviour if we pass in our own stemmer: + >>> stem = xapian.Stem('en') + >>> hl = Highlighter(stemmer=stem) + >>> hl.highlight('Hello. word. wording. wordinging.', ['word'], '<>') + 'Hello. <word>. <wording>. wordinging.' + + ''', + + 'unicode': r''' + + Test behaviour if we pass in unicode input: + >>> hl = Highlighter('en') + >>> hl.highlight(u'Hello\xf3. word. wording. wordinging.', ['word'], '<>') + 'Hello\xc3\xb3. <word>. <wording>. wordinging.' + + ''', + + 'no_sample': r''' + + Test behaviour if we pass in unicode input: + >>> hl = Highlighter('en') + >>> hl.makeSample(u'', ['word']) + '' + + ''', + + 'short_samples': r''' + + >>> hl = Highlighter('en') + >>> hl.makeSample("A boring start. Hello world indeed. A boring end.", ['hello'], 20, ('<', '>')) + '.. <Hello> world ..' + >>> hl.makeSample("A boring start. Hello world indeed. A boring end.", ['hello'], 40, ('<', '>')) + 'A boring start. <Hello> world indeed...' + >>> hl.makeSample("A boring start. Hello world indeed. A boring end.", ['boring'], 40, ('<', '>')) + 'A <boring> start... A <boring> end.' + + ''', + + 'apostrophes': r''' + + >>> hl = Highlighter('en') + >>> hl.makeSample("A boring start. Hello world's indeed. A boring end.", ['world'], 40, ('<', '>')) + "A boring start. Hello <world's> indeed..." 
+ + ''', + +} + +if __name__ == '__main__': + import doctest, sys + doctest.testmod (sys.modules[__name__]) diff --git a/secore/indexerconnection.py b/secore/indexerconnection.py new file mode 100644 index 0000000..be82319 --- /dev/null +++ b/secore/indexerconnection.py @@ -0,0 +1,380 @@ +#!/usr/bin/env python +# +# Copyright (C) 2007 Lemur Consulting Ltd +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +r"""indexerconnection.py: A connection to the search engine for indexing. + +""" +__docformat__ = "restructuredtext en" + +import xapian as _xapian +from datastructures import * +from fieldactions import * +import fieldmappings as _fieldmappings +import errors as _errors +import os as _os +import cPickle as _cPickle + +class IndexerConnection(object): + """A connection to the search engine for indexing. + + """ + + def __init__(self, indexpath): + """Create a new connection to the index. + + There may only be one indexer connection for a particular database open + at a given time. Therefore, if a connection to the database is already + open, this will raise a xapian.DatabaseLockError. + + If the database doesn't already exist, it will be created. + + """ + self._index = _xapian.WritableDatabase(indexpath, _xapian.DB_CREATE_OR_OPEN) + self._indexpath = indexpath + + # Read existing actions. 
+ self._field_actions = {}
+ self._field_mappings = _fieldmappings.FieldMappings()
+ self._next_docid = 0
+ self._config_modified = False
+ self._load_config()
+
+ def _store_config(self):
+ """Store the configuration for the database.
+
+ Currently, this stores the configuration in a file in the database
+ directory, so changes to it are not protected by transactions. When
+ support is available in xapian for storing metadata associated with
+ databases, this will be used instead of a file.
+
+ """
+ config_str = _cPickle.dumps((
+ self._field_actions,
+ self._field_mappings.serialise(),
+ self._next_docid,
+ ), 2)
+ config_file = _os.path.join(self._indexpath, 'config')
+ fd = open(config_file, "w")
+ fd.write(config_str)
+ fd.close()
+ self._config_modified = False
+
+ def _load_config(self):
+ """Load the configuration for the database.
+
+ """
+ config_file = _os.path.join(self._indexpath, 'config')
+ if not _os.path.exists(config_file):
+ return
+ fd = open(config_file)
+ config_str = fd.read()
+ fd.close()
+
+ (self._field_actions, mappings, self._next_docid) = _cPickle.loads(config_str)
+ self._field_mappings = _fieldmappings.FieldMappings(mappings)
+ self._config_modified = False
+
+ def _allocate_id(self):
+ """Allocate a new ID.
+
+ """
+ while True:
+ idstr = "%x" % self._next_docid
+ self._next_docid += 1
+ if not self._index.term_exists('Q' + idstr):
+ break
+ self._config_modified = True
+ return idstr
+
+ def add_field_action(self, fieldname, fieldtype, **kwargs):
+ """Add an action to be performed on a field.
+
+ Note that this change to the configuration will not be preserved on
+ disk until the next call to flush(). 
+ + """ + if self._index is None: + raise _errors.IndexerError("IndexerConnection has been closed") + if fieldname in self._field_actions: + actions = self._field_actions[fieldname] + else: + actions = FieldActions(fieldname) + self._field_actions[fieldname] = actions + actions.add(self._field_mappings, fieldtype, **kwargs) + self._config_modified = True + + def clear_field_actions(self, fieldname): + """Clear all actions for the specified field. + + This does not report an error if there are already no actions for the + specified field. + + Note that this change to the configuration will not be preserved on + disk until the next call to flush(). + + """ + if self._index is None: + raise _errors.IndexerError("IndexerConnection has been closed") + if fieldname in self._field_actions: + del self._field_actions[fieldname] + self._config_modified = True + + def process(self, document): + """Process an UnprocessedDocument with the settings in this database. + + The resulting ProcessedDocument is returned. + + Note that this processing will be automatically performed if an + UnprocessedDocument is supplied to the add() or replace() methods of + IndexerConnection. This method is exposed to allow the processing to + be performed separately, which may be desirable if you wish to manually + modify the processed document before adding it to the database, or if + you want to split processing of documents from adding documents to the + database for performance reasons. + + """ + if self._index is None: + raise _errors.IndexerError("IndexerConnection has been closed") + result = ProcessedDocument(self._field_mappings) + result.id = document.id + context = ActionContext(self._index) + + for field in document.fields: + try: + actions = self._field_actions[field.name] + except KeyError: + # If no actions are defined, just ignore the field. 
+ continue + actions.perform(result, field.value, context) + + return result + + def add(self, document): + """Add a new document to the search engine index. + + If the document has a id set, and the id already exists in + the database, an exception will be raised. Use the replace() method + instead if you wish to overwrite documents. + + Returns the id of the newly added document (making up a new + unique ID if no id was set). + + The supplied document may be an instance of UnprocessedDocument, or an + instance of ProcessedDocument. + + """ + if self._index is None: + raise _errors.IndexerError("IndexerConnection has been closed") + if not hasattr(document, '_doc'): + # It's not a processed document. + document = self.process(document) + + # Ensure that we have a id + orig_id = document.id + if orig_id is None: + id = self._allocate_id() + document.id = id + else: + id = orig_id + if self._index.term_exists('Q' + id): + raise _errors.IndexerError("Document ID of document supplied to add() is not unique.") + + # Add the document. + xapdoc = document.prepare() + self._index.add_document(xapdoc) + + if id is not orig_id: + document.id = orig_id + return id + + def replace(self, document): + """Replace a document in the search engine index. + + If the document does not have a id set, an exception will be + raised. + + If the document has a id set, and the id does not already + exist in the database, this method will have the same effect as add(). + + """ + if self._index is None: + raise _errors.IndexerError("IndexerConnection has been closed") + if not hasattr(document, '_doc'): + # It's not a processed document. + document = self.process(document) + + # Ensure that we have a id + id = document.id + if id is None: + raise _errors.IndexerError("No document ID set for document supplied to replace().") + + xapdoc = document.prepare() + self._index.replace_document('Q' + id, xapdoc) + + def delete(self, id): + """Delete a document from the search engine index. 
+ + If the id does not already exist in the database, this method + will have no effect (and will not report an error). + + """ + if self._index is None: + raise _errors.IndexerError("IndexerConnection has been closed") + self._index.delete_document('Q' + id) + + def flush(self): + """Apply recent changes to the database. + + If an exception occurs, any changes since the last call to flush() may + be lost. + + """ + if self._index is None: + raise _errors.IndexerError("IndexerConnection has been closed") + if self._config_modified: + self._store_config() + self._index.flush() + + def close(self): + """Close the connection to the database. + + It is important to call this method before allowing the class to be + garbage collected, because it will ensure that any un-flushed changes + will be flushed. It also ensures that the connection is cleaned up + promptly. + + No other methods may be called on the connection after this has been + called. (It is permissible to call close() multiple times, but + only the first call will have any effect.) + + If an exception occurs, the database will be closed, but changes since + the last call to flush may be lost. + + """ + if self._index is None: + return + try: + self.flush() + finally: + # There is currently no "close()" method for xapian databases, so + # we have to rely on the garbage collector. Since we never copy + # the _index property out of this class, there should be no cycles, + # so the standard python implementation should garbage collect + # _index straight away. A close() method is planned to be added to + # xapian at some point - when it is, we should call it here to make + # the code more robust. + self._index = None + self._indexpath = None + self._field_actions = None + self._config_modified = False + + def get_doccount(self): + """Count the number of documents in the database. + + This count will include documents which have been added or removed but + not yet flushed(). 
+
+ """
+ if self._index is None:
+ raise _errors.IndexerError("IndexerConnection has been closed")
+ return self._index.get_doccount()
+
+ def iterids(self):
+ """Get an iterator which returns all the ids in the database.
+
+ The unique_ids are currently returned in binary lexicographical sort
+ order, but this should not be relied on.
+
+ """
+ if self._index is None:
+ raise _errors.IndexerError("IndexerConnection has been closed")
+ return PrefixedTermIter('Q', self._index.allterms())
+
+ def get_document(self, id):
+ """Get the document with the specified unique ID.
+
+ Raises a KeyError if there is no such document. Otherwise, it returns
+ a ProcessedDocument.
+
+ """
+ if self._index is None:
+ raise _errors.IndexerError("IndexerConnection has been closed")
+ postlist = self._index.postlist('Q' + id)
+ try:
+ plitem = postlist.next()
+ except StopIteration:
+ # Unique ID not found
+ raise KeyError('Unique ID %r not found' % id)
+ try:
+ postlist.next()
+ raise _errors.IndexerError("Multiple documents " #pragma: no cover
+ "found with same unique ID")
+ except StopIteration:
+ # Only one instance of the unique ID found, as it should be.
+ pass
+
+ result = ProcessedDocument(self._field_mappings)
+ result.id = id
+ result._doc = self._index.get_document(plitem.docid)
+ return result
+
+class PrefixedTermIter(object):
+ """Iterate through all the terms with a given prefix.
+
+ """
+ def __init__(self, prefix, termiter):
+ """Initialise the prefixed term iterator.
+
+ - `prefix` is the prefix to return terms for.
+ - `termiter` is a xapian TermIterator, which should be at its start.
+
+ """
+
+ # The algorithm used in next() currently only works for single
+ # character prefixes, so assert that the prefix is single character. 
+ # To deal with multicharacter prefixes, we need to check for terms + # which have a starting prefix equal to that given, but then have a + # following uppercase alphabetic character, indicating that the actual + # prefix is longer than the target prefix. We then need to skip over + # these. Not too hard to implement, but we don't need it yet. + assert(len(prefix) == 1) + + self._started = False + self._prefix = prefix + self._prefixlen = len(prefix) + self._termiter = termiter + + def __iter__(self): + return self + + def next(self): + """Get the next term with the specified prefix. + + + """ + if not self._started: + term = self._termiter.skip_to(self._prefix).term + self._started = True + else: + term = self._termiter.next().term + if len(term) < self._prefixlen or term[:self._prefixlen] != self._prefix: + raise StopIteration + return term[self._prefixlen:] + +if __name__ == '__main__': + import doctest, sys + doctest.testmod (sys.modules[__name__]) diff --git a/secore/marshall.py b/secore/marshall.py new file mode 100644 index 0000000..ebcc71d --- /dev/null +++ b/secore/marshall.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python +# +# Copyright (C) 2007 Lemur Consulting Ltd +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+r"""marshall.py: Marshal values into strings + +""" +__docformat__ = "restructuredtext en" + +import math + +def _long_to_base256_array(value, length, flip): + result = [] + for i in xrange(length): + n = value % 256 + if flip: n = 255 - n + result.insert(0, chr(n)) + value /= 256 + return result + +def float_to_string(value): + """Marshall a floating point number to a string which sorts in the + appropriate manner. + + """ + mantissa, exponent = math.frexp(value) + sign = '1' + if mantissa < 0: + mantissa = -mantissa + sign = '0' + + # IEEE representation of doubles uses 11 bits for the exponent, with a bias + # of 1023. There's then another 52 bits in the mantissa, so we need to + # add 1075 to be sure that the exponent won't be negative. + # Even then, we check that the exponent isn't negative, and consider the + # value to be equal to zero if it is. + exponent += 1075 + if exponent < 0: # Note - this can't happen on most architectures #pragma: no cover + exponent = 0 + mantissa = 0 + elif mantissa == 0: + exponent = 0 + + # IEEE representation of doubles uses 52 bits for the mantissa. Convert it + # to a 7 character string, and convert the exponent to a 2 character + # string. + + mantissa = long(mantissa * (2**52)) + + digits = [sign] + digits.extend(_long_to_base256_array(exponent, 2, sign == '0')) + digits.extend(_long_to_base256_array(mantissa, 7, sign == '0')) + + return ''.join(digits) + +def date_to_string(date): + """Marshall a date to a string which sorts in the appropriate manner. 
+ + """ + return '%04d%02d%02d' % (date.year, date.month, date.day) diff --git a/secore/parsedate.py b/secore/parsedate.py new file mode 100644 index 0000000..684d5f2 --- /dev/null +++ b/secore/parsedate.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python +# +# Copyright (C) 2007 Lemur Consulting Ltd +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +r"""parsedate.py: Parse date strings. + +""" +__docformat__ = "restructuredtext en" + +import datetime +import re + +yyyymmdd_re = re.compile(r'(?P<year>[0-9]{4})(?P<month>[0-9]{2})(?P<day>[0-9]{2})$') +yyyy_mm_dd_re = re.compile(r'(?P<year>[0-9]{4})([-/.])(?P<month>[0-9]{2})\2(?P<day>[0-9]{2})$') + +def date_from_string(value): + """Parse a string into a date. + + If the value supplied is already a date-like object (ie, has 'year', + 'month' and 'day' attributes), it is returned without processing. 
+ + Supported date formats are: + + - YYYYMMDD + - YYYY-MM-DD + - YYYY/MM/DD + - YYYY.MM.DD + + """ + if (hasattr(value, 'year') + and hasattr(value, 'month') + and hasattr(value, 'day')): + return value + + mg = yyyymmdd_re.match(value) + if mg is None: + mg = yyyy_mm_dd_re.match(value) + + if mg is not None: + year, month, day = (int(i) for i in mg.group('year', 'month', 'day')) + return datetime.date(year, month, day) + + raise ValueError('Unrecognised date format') diff --git a/secore/searchconnection.py b/secore/searchconnection.py new file mode 100644 index 0000000..79fa509 --- /dev/null +++ b/secore/searchconnection.py @@ -0,0 +1,618 @@ +#!/usr/bin/env python +# +# Copyright (C) 2007 Lemur Consulting Ltd +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +r"""searchconnection.py: A connection to the search engine for searching. + +""" +__docformat__ = "restructuredtext en" + +import xapian as _xapian +from datastructures import * +from fieldactions import * +import fieldmappings as _fieldmappings +import highlight as _highlight +import errors as _errors +import os as _os +import cPickle as _cPickle + +class SearchResult(ProcessedDocument): + """A result from a search. 
+ + """ + def __init__(self, msetitem, results): + ProcessedDocument.__init__(self, results._fieldmappings, msetitem.document) + self.rank = msetitem.rank + self._results = results + + def _get_language(self, field): + """Get the language that should be used for a given field. + + """ + actions = self._results._conn._field_actions[field]._actions + for action, kwargslist in actions.iteritems(): + if action == FieldActions.INDEX_FREETEXT: + for kwargs in kwargslist: + try: + return kwargs['language'] + except KeyError: + pass + return 'none' + + def summarise(self, field, maxlen=600, hl=('<b>', '</b>')): + """Return a summarised version of the field specified. + + This will return a summary of the contents of the field stored in the + search result, with words which match the query highlighted. + + The maximum length of the summary (in characters) may be set using the + maxlen parameter. + + The return value will be a string holding the summary, with + highlighting applied. If there are multiple instances of the field in + the document, the instances will be joined with a newline character. + + To turn off highlighting, set hl to None. Each highlight will consist + of the first entry in the `hl` list being placed before the word, and + the second entry in the `hl` list being placed after the word. + + Any XML or HTML style markup tags in the field will be stripped before + the summarisation algorithm is applied. + + """ + highlighter = _highlight.Highlighter(language_code=self._get_language(field)) + field = self.data[field] + results = [] + text = '\n'.join(field) + return highlighter.makeSample(text, self._results._query, maxlen, hl) + + def highlight(self, field, hl=('<b>', '</b>'), strip_tags=False): + """Return a highlighted version of the field specified. + + This will return all the contents of the field stored in the search + result, with words which match the query highlighted. 
+ + The return value will be a list of strings (corresponding to the list + of strings which is the raw field data). + + Each highlight will consist of the first entry in the `hl` list being + placed before the word, and the second entry in the `hl` list being + placed after the word. + + If `strip_tags` is True, any XML or HTML style markup tags in the field + will be stripped before highlighting is applied. + + """ + highlighter = _highlight.Highlighter(language_code=self._get_language(field)) + field = self.data[field] + results = [] + for text in field: + results.append(highlighter.highlight(text, self._results._query, hl, strip_tags)) + return results + + def __repr__(self): + return ('<SearchResult(rank=%d, id=%r, data=%r)>' % + (self.rank, self.id, self.data)) + + +class SearchResultIter(object): + """An iterator over a set of results from a search. + + """ + def __init__(self, results): + self._results = results + self._iter = iter(results._mset) + + def next(self): + msetitem = self._iter.next() + return SearchResult(msetitem, + self._results) + + +class SearchResults(object): + """A set of results of a search. + + """ + def __init__(self, conn, enq, query, mset, fieldmappings): + self._conn = conn + self._enq = enq + self._query = query + self._mset = mset + self._fieldmappings = fieldmappings + + def __repr__(self): + return ("<SearchResults(startrank=%d, " + "endrank=%d, " + "more_matches=%s, " + "matches_lower_bound=%d, " + "matches_upper_bound=%d, " + "matches_estimated=%d, " + "estimate_is_exact=%s)>" % + ( + self.startrank, + self.endrank, + self.more_matches, + self.matches_lower_bound, + self.matches_upper_bound, + self.matches_estimated, + self.estimate_is_exact, + )) + + def _get_more_matches(self): + # This check relies on us having asked for at least one more result + # than retrieved to be checked. 
+ return (self.matches_lower_bound > self.endrank) + more_matches = property(_get_more_matches, doc= + """Check whether there are further matches after those in this result set. + + """) + def _get_startrank(self): + return self._mset.get_firstitem() + startrank = property(_get_startrank, doc= + """Get the rank of the first item in the search results. + + This corresponds to the "startrank" parameter passed to the search() method. + + """) + def _get_endrank(self): + return self._mset.get_firstitem() + len(self._mset) + endrank = property(_get_endrank, doc= + """Get the rank of the item after the end of the search results. + + If there are sufficient results in the index, this corresponds to the + "endrank" parameter passed to the search() method. + + """) + def _get_lower_bound(self): + return self._mset.get_matches_lower_bound() + matches_lower_bound = property(_get_lower_bound, doc= + """Get a lower bound on the total number of matching documents. + + """) + def _get_upper_bound(self): + return self._mset.get_matches_upper_bound() + matches_upper_bound = property(_get_upper_bound, doc= + """Get an upper bound on the total number of matching documents. + + """) + def _get_estimated(self): + return self._mset.get_matches_estimated() + matches_estimated = property(_get_estimated, doc= + """Get an estimate for the total number of matching documents. + + """) + def _estimate_is_exact(self): + return self._mset.get_matches_lower_bound() == \ + self._mset.get_matches_upper_bound() + estimate_is_exact = property(_estimate_is_exact, doc= + """Check whether the estimated number of matching documents is exact. + + If this returns true, the estimate given by the `matches_estimated` + property is guaranteed to be correct. + + If this returns false, it is possible that the actual number of matching + documents is different from the number given by the `matches_estimated` + property. + + """) + + def get_hit(self, index): + """Get the hit with a given index. 
+
+ """
+ msetitem = self._mset.get_hit(index)
+ return SearchResult(msetitem, self)
+ __getitem__ = get_hit
+
+ def __iter__(self):
+ """Get an iterator over the hits in the search result.
+
+ The iterator returns the results in increasing order of rank.
+
+ """
+ return SearchResultIter(self)
+
+class SearchConnection(object):
+ """A connection to the search engine for searching.
+
+ The connection will access a view of the database.
+
+ """
+
+ def __init__(self, indexpath):
+ """Create a new connection to the index for searching.
+
+ There may be an arbitrary number of search connections for a
+ particular database open at a given time (regardless of whether there
+ is a connection for indexing open as well).
+
+ If the database doesn't exist, an exception will be raised.
+
+ """
+ self._index = _xapian.Database(indexpath)
+ self._indexpath = indexpath
+
+ # Read the actions.
+ self._load_config()
+
+ def _get_sort_type(self, field):
+ """Get the sort type that should be used for a given field.
+
+ """
+ actions = self._field_actions[field]._actions
+ for action, kwargslist in actions.iteritems():
+ if action == FieldActions.SORT_AND_COLLAPSE:
+ for kwargs in kwargslist:
+ return kwargs['type']
+
+ def _load_config(self):
+ """Load the configuration for the database.
+
+ """
+ # Note: this code is basically duplicated in the IndexerConnection
+ # class. Move it to a shared location.
+ config_file = _os.path.join(self._indexpath, 'config')
+ if not _os.path.exists(config_file):
+ self._field_mappings = _fieldmappings.FieldMappings()
+ return
+ fd = open(config_file)
+ config_str = fd.read()
+ fd.close()
+
+ (self._field_actions, mappings, next_docid) = _cPickle.loads(config_str)
+ self._field_mappings = _fieldmappings.FieldMappings(mappings)
+
+ def reopen(self):
+ """Reopen the connection.
+
+ This updates the revision of the index which the connection references
+ to the latest flushed revision. 
        """
        if self._index is None:
            raise _errors.SearchError("SearchConnection has been closed")
        self._index.reopen()
        # Re-read the actions.
        self._load_config()

    def close(self):
        """Close the connection to the database.

        It is important to call this method before allowing the class to be
        garbage collected to ensure that the connection is cleaned up promptly.

        No other methods may be called on the connection after this has been
        called. (It is permissible to call close() multiple times, but
        only the first call will have any effect.)

        If an exception occurs, the database will be closed, but changes since
        the last call to flush may be lost.

        """
        if self._index is None:
            return
        # There is currently no "close()" method for xapian databases, so
        # we have to rely on the garbage collector. Since we never copy
        # the _index property out of this class, there should be no cycles,
        # so the standard python implementation should garbage collect
        # _index straight away. A close() method is planned to be added to
        # xapian at some point - when it is, we should call it here to make
        # the code more robust.
        self._index = None
        self._indexpath = None
        self._field_actions = None
        self._field_mappings = None

    def get_doccount(self):
        """Count the number of documents in the database.

        This count will include documents which have been added or removed but
        not yet flushed().

        """
        if self._index is None:
            raise _errors.SearchError("SearchConnection has been closed")
        return self._index.get_doccount()

    def get_document(self, id):
        """Get the document with the specified unique ID.

        Raises a KeyError if there is no such document. Otherwise, it returns
        a ProcessedDocument.

        """
        if self._index is None:
            raise _errors.SearchError("SearchConnection has been closed")
        # Unique IDs are stored as terms with the 'Q' prefix, so the
        # document (if any) is found by walking that term's posting list.
        postlist = self._index.postlist('Q' + id)
        try:
            plitem = postlist.next()
        except StopIteration:
            # Unique ID not found
            raise KeyError('Unique ID %r not found' % id)
        try:
            # A second posting for the same 'Q' term means the uniqueness
            # invariant has been violated; StopIteration is the good case.
            postlist.next()
            raise _errors.SearchError("Multiple documents " #pragma: no cover
                                      "found with same unique ID")
        except StopIteration:
            # Only one instance of the unique ID found, as it should be.
            pass

        result = ProcessedDocument(self._field_mappings)
        result.id = id
        result._doc = self._index.get_document(plitem.docid)
        return result

    # Query-combination operators, re-exported from xapian for callers'
    # convenience (used by query_composite and as default_op values).
    OP_AND = _xapian.Query.OP_AND
    OP_OR = _xapian.Query.OP_OR
    def query_composite(self, operator, queries):
        """Build a composite query from a list of queries.

        The queries are combined with the supplied operator, which is either
        SearchConnection.OP_AND or SearchConnection.OP_OR.

        """
        if self._index is None:
            raise _errors.SearchError("SearchConnection has been closed")
        return _xapian.Query(operator, list(queries))

    def query_filter(self, query, filter):
        """Filter a query with another query.

        Documents will only match the resulting query if they match both
        queries, but will be weighted according to only the first query.

        - `query`: The query to filter.
        - `filter`: The filter to apply to the query.

        """
        if self._index is None:
            raise _errors.SearchError("SearchConnection has been closed")
        if not isinstance(filter, _xapian.Query):
            raise _errors.SearchError("Filter must be a Xapian Query object")
        # OP_FILTER: matches must satisfy both subqueries, but only the
        # left-hand query contributes to the weight.
        return _xapian.Query(_xapian.Query.OP_FILTER, query, filter)

    def query_range(self, field, begin, end):
        """Create a query for a range search.

        This creates a query which matches only those documents which have a
        field value in the specified range.

        Begin and end must be appropriate values for the field, according to
        the 'type' parameter supplied to the SORTABLE action for the field.

        The begin and end values are both inclusive - any documents with a
        value equal to begin or end will be returned (unless end is less than
        begin, in which case no documents will be returned).

        """
        if self._index is None:
            raise _errors.SearchError("SearchConnection has been closed")

        # Marshall the endpoints into the same sortable string form that was
        # used when the field values were stored, so the value-range
        # comparison is performed on comparable representations.
        sorttype = self._get_sort_type(field)
        marshaller = SortableMarshaller(False)
        fn = marshaller.get_marshall_function(field, sorttype)
        begin = fn(field, begin)
        end = fn(field, end)

        slot = self._field_mappings.get_slot(field)
        return _xapian.Query(_xapian.Query.OP_VALUE_RANGE, slot, begin, end)

    def _prepare_queryparser(self, allow, deny, default_op):
        """Prepare (and return) a query parser using the specified fields and
        operator.

        Exactly one of `allow` and `deny` may be supplied; `allow` defaults
        to every configured field.

        """
        if self._index is None:
            raise _errors.SearchError("SearchConnection has been closed")
        if allow is not None and deny is not None:
            raise _errors.SearchError("Cannot specify both `allow` and `deny`")
        qp = _xapian.QueryParser()
        qp.set_database(self._index)
        qp.set_default_op(default_op)

        if allow is None:
            allow = [key for key in self._field_actions]
        if deny is not None:
            allow = [key for key in allow if key not in deny]

        for field in allow:
            actions = self._field_actions[field]._actions
            for action, kwargslist in actions.iteritems():
                if action == FieldActions.INDEX_EXACT:
                    # FIXME - need patched version of xapian to add exact prefixes
                    #qp.add_exact_prefix(field, self._field_mappings.get_prefix(field))
                    qp.add_prefix(field, self._field_mappings.get_prefix(field))
                if action == FieldActions.INDEX_FREETEXT:
                    qp.add_prefix(field, self._field_mappings.get_prefix(field))
                    for kwargs in kwargslist:
                        # 'language' is optional in the action's kwargs; only
                        # enable stemming when it was configured.
                        try:
                            lang = kwargs['language']
                            qp.set_stemmer(_xapian.Stem(lang))
                            qp.set_stemming_strategy(qp.STEM_SOME)
                        except KeyError:
                            pass
        return qp

    def query_parse(self, string, allow=None, deny=None, default_op=OP_AND):
        """Parse a query string.

        This is intended for parsing queries entered by a user. If you wish to
        combine structured queries, it is generally better to use the other
        query building methods, such as `query_composite`.

        - `string`: The string to parse.
        - `allow`: A list of fields to allow in the query.
        - `deny`: A list of fields not to allow in the query.

        Only one of `allow` and `deny` may be specified.

        If any of the entries in `allow` or `deny` are not present in the
        configuration for the database, an exception will be raised.

        Returns a Query object, which may be passed to the search() method, or
        combined with other queries.

        """
        qp = self._prepare_queryparser(allow, deny, default_op)
        try:
            return qp.parse_query(string)
        except _xapian.QueryParserError, e:
            # If we got a parse error, retry without boolean operators (since
            # these are the usual cause of the parse error).
            # flags=0 disables all special query syntax.
            return qp.parse_query(string, 0)

    def query_field(self, field, value, default_op=OP_AND):
        """A query for a single field.

        """
        if self._index is None:
            raise _errors.SearchError("SearchConnection has been closed")
        try:
            actions = self._field_actions[field]._actions
        except KeyError:
            # Unknown field: fall through to the empty (match-nothing) query.
            actions = {}

        # need to check on field type, and stem / split as appropriate
        for action, kwargslist in actions.iteritems():
            if action == FieldActions.INDEX_EXACT:
                prefix = self._field_mappings.get_prefix(field)
                if len(value) > 0:
                    # Xapian convention: a ':' is inserted between the prefix
                    # and a term starting with a capital letter, so the end of
                    # the prefix is unambiguous.
                    chval = ord(value[0])
                    if chval >= ord('A') and chval <= ord('Z'):
                        prefix = prefix + ':'
                return _xapian.Query(prefix + value)
            if action == FieldActions.INDEX_FREETEXT:
                qp = _xapian.QueryParser()
                qp.set_default_op(default_op)
                prefix = self._field_mappings.get_prefix(field)
                for kwargs in kwargslist:
                    # Enable stemming only if a language was configured.
                    try:
                        lang = kwargs['language']
                        qp.set_stemmer(_xapian.Stem(lang))
                        qp.set_stemming_strategy(qp.STEM_SOME)
                    except KeyError:
                        pass
                return qp.parse_query(value,
                                      qp.FLAG_PHRASE | qp.FLAG_BOOLEAN | qp.FLAG_LOVEHATE,
                                      prefix)

        # Field is neither INDEX_EXACT nor INDEX_FREETEXT: match nothing.
        return _xapian.Query()

    def query_all(self):
        """A query which matches all the documents in the database.

        """
        return _xapian.Query('')

    def spell_correct(self, string, allow=None, deny=None):
        """Correct a query spelling.

        This returns a version of the query string with any misspelt words
        corrected.

        - `allow`: A list of fields to allow in the query.
        - `deny`: A list of fields not to allow in the query.

        Only one of `allow` and `deny` may be specified.

        If any of the entries in `allow` or `deny` are not present in the
        configuration for the database, an exception will be raised.

        """
        qp = self._prepare_queryparser(allow, deny, self.OP_AND)
        qp.parse_query(string, qp.FLAG_PHRASE|qp.FLAG_BOOLEAN|qp.FLAG_LOVEHATE|qp.FLAG_SPELLING_CORRECTION)
        # An empty corrected string means no correction was needed; return
        # the original (encoded for consistency with xapian's behaviour).
        corrected = qp.get_corrected_query_string()
        if len(corrected) == 0:
            if isinstance(string, unicode):
                # Encode as UTF-8 for consistency - this happens automatically
                # to values passed to Xapian.
                return string.encode('utf-8')
            return string
        return corrected

    def search(self, query, startrank, endrank,
               checkatleast=0, sortby=None, collapse=None):
        """Perform a search, for documents matching a query.

        - `query` is the query to perform.
        - `startrank` is the rank of the start of the range of matching
          documents to return (ie, the result with this rank will be returned).
          ranks start at 0, which represents the "best" matching document.
        - `endrank` is the rank at the end of the range of matching documents
          to return. This is exclusive, so the result with this rank will not
          be returned.
        - `checkatleast` is the minimum number of results to check for: the
          estimate of the total number of matches will always be exact if
          the number of matches is less than `checkatleast`.
        - `sortby` is the name of a field to sort by. It may be preceded by a
          '+' or a '-' to indicate ascending or descending order
          (respectively). If the first character is neither '+' or '-', the
          sort will be in ascending order.
        - `collapse` is the name of a field to collapse the result documents
          on. If this is specified, there will be at most one result in the
          result set for each value of the field.

        """
        if self._index is None:
            raise _errors.SearchError("SearchConnection has been closed")
        enq = _xapian.Enquire(self._index)
        enq.set_query(query)

        if sortby is not None:
            # Strip the optional leading '+'/'-' direction indicator.
            asc = True
            if sortby[0] == '-':
                asc = False
                sortby = sortby[1:]
            elif sortby[0] == '+':
                sortby = sortby[1:]

            try:
                slotnum = self._field_mappings.get_slot(sortby)
            except KeyError:
                raise _errors.SearchError("Field %r was not indexed for sorting" % sortby)

            # Note: we invert the "asc" parameter, because xapian treats
            # "ascending" as meaning "higher values are better"; in other
            # words, it considers "ascending" to mean return results in
            # descending order.
            enq.set_sort_by_value_then_relevance(slotnum, not asc)

        if collapse is not None:
            try:
                slotnum = self._field_mappings.get_slot(collapse)
            except KeyError:
                raise _errors.SearchError("Field %r was not indexed for collapsing" % collapse)
            enq.set_collapse_key(slotnum)

        maxitems = max(endrank - startrank, 0)
        # Always check for at least one more result, so we can report whether
        # there are more matches.
        checkatleast = max(checkatleast, endrank + 1)

        # We don't depend on any particular docid order, so let xapian pick
        # whichever is cheapest.
        enq.set_docid_order(enq.DONT_CARE)

        # Repeat the search until we don't get a DatabaseModifiedError
        # (another process may have modified the database under us; reopening
        # picks up the new revision, then we retry).
        while True:
            try:
                mset = enq.get_mset(startrank, maxitems, checkatleast)
                break
            except _xapian.DatabaseModifiedError, e:
                self.reopen()
        return SearchResults(self, enq, query, mset, self._field_mappings)

if __name__ == '__main__':
    import doctest, sys
    doctest.testmod (sys.modules[__name__])