author     Marco Pesenti Gritti <marco@localhost.localdomain>  2007-07-11 19:37:48 (GMT)
committer  Marco Pesenti Gritti <marco@localhost.localdomain>  2007-07-11 19:37:48 (GMT)
commit     3a3a2c361fbf670ee5375e669d34be386f6924f8 (patch)
tree       b436ec29c47fdb983e8355c31768a1e3d2b10a6c
parent     cb8a3f7e34b07a4d3fb3ebb3cb7eddceaec0e73d (diff)
Add secore. Cut and paste from http://flaxcode.googlecode.com/svn/trunk/libs/secore/secore/.
-rw-r--r--  Makefile.am                 |   2
-rw-r--r--  configure.ac                |   1
-rw-r--r--  secore/Makefile.am          |  12
-rw-r--r--  secore/__init__.py          |  30
-rw-r--r--  secore/datastructures.py    | 216
-rw-r--r--  secore/errors.py            |  40
-rw-r--r--  secore/fieldactions.py      | 358
-rw-r--r--  secore/fieldmappings.py     | 123
-rw-r--r--  secore/highlight.py         | 310
-rw-r--r--  secore/indexerconnection.py | 380
-rw-r--r--  secore/marshall.py          |  73
-rw-r--r--  secore/parsedate.py         |  56
-rw-r--r--  secore/searchconnection.py  | 618
13 files changed, 2218 insertions(+), 1 deletion(-)
diff --git a/Makefile.am b/Makefile.am
index 8060aae..abf71cf 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,4 +1,4 @@
-SUBDIRS = bin etc src
+SUBDIRS = bin etc secore src
test:
@cd tests
diff --git a/configure.ac b/configure.ac
index 4824635..c60229a 100644
--- a/configure.ac
+++ b/configure.ac
@@ -12,6 +12,7 @@ AC_OUTPUT([
Makefile
bin/Makefile
etc/Makefile
+secore/Makefile
src/Makefile
src/olpc/Makefile
src/olpc/datastore/Makefile
diff --git a/secore/Makefile.am b/secore/Makefile.am
new file mode 100644
index 0000000..393ba8f
--- /dev/null
+++ b/secore/Makefile.am
@@ -0,0 +1,12 @@
+datastoredir = $(pythondir)/secore
+datastore_PYTHON = \
+ __init__.py \
+ datastructures.py \
+ fieldmappings.py \
+ searchconnection.py \
+ errors.py \
+ highlight.py \
+ marshall.py \
+ fieldactions.py \
+ indexerconnection.py \
+ parsedate.py
diff --git a/secore/__init__.py b/secore/__init__.py
new file mode 100644
index 0000000..157fea4
--- /dev/null
+++ b/secore/__init__.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python
+#
+# Copyright (C) 2007 Lemur Consulting Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+"""Search engine Core.
+
+See the accompanying documentation for details. In particular, there should be
+an accompanying file "introduction.html" (or "introduction.rst") which gives
+details of how to use the secore package.
+
+"""
+__docformat__ = "restructuredtext en"
+
+from datastructures import *
+from errors import *
+from indexerconnection import *
+from searchconnection import *
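
The star imports above make the package's public classes available directly from the top-level secore namespace. A minimal sketch of the intended indexing flow, pieced together from the classes added in this commit (the database path and field name are illustrative, and FieldActions is assumed to be re-exported via the star import of indexerconnection):

import secore

conn = secore.IndexerConnection('/tmp/testdb')   # illustrative path
conn.add_field_action('text', secore.FieldActions.INDEX_FREETEXT, language='en')
conn.add_field_action('text', secore.FieldActions.STORE_CONTENT)

doc = secore.UnprocessedDocument()
doc.fields.append(secore.Field('text', 'hello world'))
docid = conn.add(doc)   # returns the allocated unique ID
conn.flush()            # persist documents and configuration
conn.close()
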
diff --git a/secore/datastructures.py b/secore/datastructures.py
new file mode 100644
index 0000000..414625d
--- /dev/null
+++ b/secore/datastructures.py
@@ -0,0 +1,216 @@
+#!/usr/bin/env python
+#
+# Copyright (C) 2007 Lemur Consulting Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+r"""datastructures.py: Datastructures for search engine core.
+
+"""
+__docformat__ = "restructuredtext en"
+
+import xapian as _xapian
+import cPickle as _cPickle
+
+class Field(object):
+ # Use __slots__ because we're going to have very many Field objects in
+ # typical usage.
+ __slots__ = 'name', 'value'
+
+ def __init__(self, name, value):
+ self.name = name
+ self.value = value
+
+ def __repr__(self):
+ return 'Field(%r, %r)' % (self.name, self.value)
+
+class UnprocessedDocument(object):
+ """A unprocessed document to be passed to the indexer.
+
+ This represents an item to be processed and stored in the search engine.
+ Each document will be processed by the indexer to generate a
+ ProcessedDocument, which can then be stored in the search engine index.
+
+ Note that some information in an UnprocessedDocument will not be
+ represented in the ProcessedDocument: therefore, it is not possible to
+ retrieve an UnprocessedDocument from the search engine index.
+
+ An unprocessed document is a simple container with two attributes:
+
+ - `fields` is a list of Field objects.
+ - `id` is a string holding a unique identifier for the document (or
+ None to get the database to allocate a unique identifier automatically
+ when the document is added).
+
+ """
+
+ __slots__ = 'id', 'fields',
+ def __init__(self, id=None, fields=None):
+ self.id = id
+ if fields is None:
+ self.fields = []
+ else:
+ self.fields = fields
+
+ def __repr__(self):
+ return 'UnprocessedDocument(%r, %r)' % (self.id, self.fields)
+
+class ProcessedDocument(object):
+ """A processed document, as stored in the index.
+
+ This represents an item which is ready to be stored in the search engine,
+ or which has been returned by the search engine.
+
+ """
+
+ __slots__ = '_doc', '_fieldmappings', '_data',
+ def __init__(self, fieldmappings, xapdoc=None):
+ """Create a ProcessedDocument.
+
+ `fieldmappings` is the field configuration from a database connection,
+ used to look up how to store each field.
+
+ If supplied, `xapdoc` is a Xapian document to store in the processed
+ document. Otherwise, a new Xapian document is created.
+
+ """
+ if xapdoc is None:
+ self._doc = _xapian.Document()
+ else:
+ self._doc = xapdoc
+ self._fieldmappings = fieldmappings
+ self._data = None
+
+ def add_term(self, field, term, wdfinc=1, positions=None):
+ """Add a term to the document.
+
+ Terms are the main unit of information used for performing searches.
+
+ - `field` is the field to add the term to.
+ - `term` is the term to add.
+ - `wdfinc` is the value to increase the within-document-frequency
+ measure for the term by.
+ - `positions` is the positional information to add for the term.
+ This may be None to indicate that there is no positional information,
+ or may be an integer to specify one position, or may be a sequence of
+ integers to specify several positions. (Note that the wdf is not
+ increased automatically for each position: if you add a term at 7
+ positions, and the wdfinc value is 2, the total wdf for the term will
+ only be increased by 2, not by 14.)
+
+ """
+ prefix = self._fieldmappings.get_prefix(field)
+ if len(term) > 0:
+ # We use the following check, rather than "isupper()" to ensure
+ # that we match the check performed by the queryparser, regardless
+ # of our locale.
+ if ord(term[0]) >= ord('A') and ord(term[0]) <= ord('Z'):
+ prefix = prefix + ':'
+ if positions is None:
+ self._doc.add_term(prefix + term, wdfinc)
+ elif isinstance(positions, int):
+ self._doc.add_posting(prefix + term, positions, wdfinc)
+ else:
+ self._doc.add_term(prefix + term, wdfinc)
+ for pos in positions:
+ self._doc.add_posting(prefix + term, pos, 0)
+
+ def add_value(self, field, value):
+ """Add a value to the document.
+
+ Values are additional units of information used when performing
+ searches. Note that values are _not_ intended to be used to store
+ information for display in the search results - use the document data
+ for that. The intention is that as little information as possible is
+ stored in values, so that they can be accessed as quickly as possible
+ during the search operation.
+
+ Unlike terms, each document may have at most one value in each field
+ (whereas there may be an arbitrary number of terms in a given field).
+ If an attempt to add multiple values to a single field is made, only
+ the last value added will be stored.
+
+ """
+ slot = self._fieldmappings.get_slot(field)
+ self._doc.add_value(slot, value)
+
+ def get_value(self, field):
+ """Get a value from the document.
+
+ """
+ slot = self._fieldmappings.get_slot(field)
+ return self._doc.get_value(slot)
+
+ def prepare(self):
+ """Prepare the document for adding to a xapian database.
+
+ This updates the internal xapian document with any changes which have
+ been made, and then returns it.
+
+ """
+ if self._data is not None:
+ self._doc.set_data(_cPickle.dumps(self._data, 2))
+ self._data = None
+ return self._doc
+
+ def _get_data(self):
+ if self._data is None:
+ rawdata = self._doc.get_data()
+ if rawdata == '':
+ self._data = {}
+ else:
+ self._data = _cPickle.loads(rawdata)
+ return self._data
+ def _set_data(self, data):
+ if not isinstance(data, dict):
+ raise TypeError("Cannot set data to any type other than a dict")
+ self._data = data
+ data = property(_get_data, _set_data, doc=
+ """The data stored in this processed document.
+
+ This data is a dictionary of entries, where the key is a fieldname, and the
+ value is a list of strings.
+
+ """)
+
+ def _get_id(self):
+ tl = self._doc.termlist()
+ try:
+ term = tl.skip_to('Q').term
+ if len(term) == 0 or term[0] != 'Q':
+ return None
+ except StopIteration:
+ return None
+ return term[1:]
+ def _set_id(self, id):
+ tl = self._doc.termlist()
+ try:
+ term = tl.skip_to('Q').term
+ except StopIteration:
+ term = ''
+ if len(term) != 0 and term[0] == 'Q':
+ self._doc.remove_term(term)
+ if id is not None:
+ self._doc.add_term('Q' + id, 0)
+ id = property(_get_id, _set_id, doc=
+ """The unique ID for this document.
+
+ """)
+
+ def __repr__(self):
+ return '<ProcessedDocument(%r)>' % (self.id)
+
+if __name__ == '__main__':
+ import doctest, sys
+ doctest.testmod (sys.modules[__name__])
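
A ProcessedDocument only works against a FieldMappings object that has already allocated a prefix (for terms) and a slot (for values) to each field it touches. A small sketch of the round trip, using only this file and fieldmappings.py below (the field name and ID are illustrative):

from secore import datastructures, fieldmappings

maps = fieldmappings.FieldMappings()
maps.add_prefix('title')            # first allocated prefix is 'XA'
maps.add_slot('title')              # first allocated slot is 0

doc = datastructures.ProcessedDocument(maps)
doc.id = 'doc1'                     # stored as the term 'Qdoc1'
doc.add_term('title', 'hello')      # stored as 'XAhello'
doc.add_value('title', 'Hello')
doc.data = {'title': ['Hello']}     # pickled into the document data
xapdoc = doc.prepare()              # the underlying xapian.Document
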
diff --git a/secore/errors.py b/secore/errors.py
new file mode 100644
index 0000000..b6ad00f
--- /dev/null
+++ b/secore/errors.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python
+#
+# Copyright (C) 2007 Lemur Consulting Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+r"""errors.py: Exceptions for the search engine core.
+
+"""
+__docformat__ = "restructuredtext en"
+
+class SearchEngineError(Exception):
+ r"""Base class for exceptions thrown by the search engine.
+
+ Any errors generated by the python level interface to xapian will be
+ instances of this class or its subclasses.
+
+ """
+
+class IndexerError(SearchEngineError):
+ r"""Class used to report errors from the indexing API.
+
+ """
+
+class SearchError(SearchEngineError):
+ r"""Class used to report errors from the search API.
+
+ """
+
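
Since IndexerError and SearchError both derive from SearchEngineError, callers can catch everything raised by this layer with the base class, or handle the two APIs separately. A sketch (the application functions are hypothetical):

from secore import errors

try:
    run_indexing_pass()           # hypothetical application code
except errors.IndexerError:
    recover_indexer()             # indexing-specific failures
except errors.SearchEngineError:
    recover_engine()              # anything else raised by this layer
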
diff --git a/secore/fieldactions.py b/secore/fieldactions.py
new file mode 100644
index 0000000..c595f0b
--- /dev/null
+++ b/secore/fieldactions.py
@@ -0,0 +1,358 @@
+#!/usr/bin/env python
+#
+# Copyright (C) 2007 Lemur Consulting Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+r"""fieldactions.py: Definitions and implementations of field actions.
+
+"""
+__docformat__ = "restructuredtext en"
+
+import errors as _errors
+import marshall as _marshall
+import xapian as _xapian
+import parsedate as _parsedate
+
+def _act_store_content(fieldname, doc, value, context):
+ """Perform the STORE_CONTENT action.
+
+ """
+ try:
+ fielddata = doc.data[fieldname]
+ except KeyError:
+ fielddata = []
+ doc.data[fieldname] = fielddata
+ fielddata.append(value)
+
+def _act_index_exact(fieldname, doc, value, context):
+ """Perform the INDEX_EXACT action.
+
+ """
+ doc.add_term(fieldname, value, 0)
+
+def _act_index_freetext(fieldname, doc, value, context, weight=1,
+ language=None, stop=None, spell=False,
+ nopos=False, noprefix=False):
+ """Perform the INDEX_FREETEXT action.
+
+ """
+ termgen = _xapian.TermGenerator()
+ if language is not None:
+ termgen.set_stemmer(_xapian.Stem(language))
+
+ if stop is not None:
+ stopper = _xapian.SimpleStopper()
+ for term in stop:
+ stopper.add (term)
+ termgen.set_stopper (stopper)
+
+ if spell:
+ termgen.set_database(context.index)
+ termgen.set_flags(termgen.FLAG_SPELLING)
+
+ termgen.set_document(doc._doc)
+ termgen.set_termpos(context.current_position)
+ if nopos:
+ termgen.index_text_without_positions(value, weight, '')
+ else:
+ termgen.index_text(value, weight, '')
+
+ if not noprefix:
+ # Store a second copy of the term with a prefix, for field-specific
+ # searches.
+ prefix = doc._fieldmappings.get_prefix(fieldname)
+ if len(prefix) != 0:
+ termgen.set_termpos(context.current_position)
+ if nopos:
+ termgen.index_text_without_positions(value, weight, prefix)
+ else:
+ termgen.index_text(value, weight, prefix)
+
+ # Add a gap between each field instance, so that phrase searches don't
+ # match across instances.
+ termgen.increase_termpos(10)
+ context.current_position = termgen.get_termpos()
+
+class SortableMarshaller(object):
+ """Implementation of marshalling for sortable values.
+
+ """
+ def __init__(self, indexing=True):
+ if indexing:
+ self._err = _errors.IndexerError
+ else:
+ self._err = _errors.SearchError
+
+ def marshall_string(self, fieldname, value):
+ """Marshall a value for sorting in lexicograpical order.
+
+ This returns the input as the output, since strings already sort in
+ lexicographical order.
+
+ """
+ return value
+
+ def marshall_float(self, fieldname, value):
+ """Marshall a value for sorting as a floating point value.
+
+ """
+ # convert the value to a float
+ try:
+ value = float(value)
+ except ValueError:
+ raise self._err("Value supplied to field %r must be a "
+ "valid floating point number: was %r" %
+ (fieldname, value))
+ return _marshall.float_to_string(value)
+
+ def marshall_date(self, fieldname, value):
+ """Marshall a value for sorting as a date.
+
+ """
+ try:
+ value = _parsedate.date_from_string(value)
+ except ValueError, e:
+ raise self._err("Value supplied to field %r must be a "
+ "valid date: was %r: error is '%s'" %
+ (fieldname, value, str(e)))
+ return _marshall.date_to_string(value)
+
+ def get_marshall_function(self, fieldname, sorttype):
+ """Get a function used to marshall values of a given sorttype.
+
+ """
+ try:
+ return {
+ None: self.marshall_string,
+ 'string': self.marshall_string,
+ 'float': self.marshall_float,
+ 'date': self.marshall_date,
+ }[sorttype]
+ except KeyError:
+ raise self._err("Unknown sort type %r for field %r" %
+ (sorttype, fieldname))
+
+
+def _act_sort_and_collapse(fieldname, doc, value, context, type=None):
+ """Perform the SORTABLE action.
+
+ """
+ marshaller = SortableMarshaller()
+ fn = marshaller.get_marshall_function(fieldname, type)
+ value = fn(fieldname, value)
+ doc.add_value(fieldname, value)
+
+class ActionContext(object):
+ """The context in which an action is performed.
+
+ This is just used to pass term generators, word positions, and the like
+ around.
+
+ """
+ def __init__(self, index):
+ self.current_language = None
+ self.current_position = 0
+ self.index = index
+
+class FieldActions(object):
+ """An object describing the actions to be performed on a field.
+
+ The supported actions are:
+
+ - `STORE_CONTENT`: store the unprocessed content of the field in the search
+ engine database. All fields which need to be displayed or used when
+ displaying the search results need to be given this action.
+
+ - `INDEX_EXACT`: index the exact content of the field as a single search
+ term. Fields whose contents need to be searchable as an "exact match"
+ need to be given this action.
+
+ - `INDEX_FREETEXT`: index the content of this field as text. The content
+ will be split into terms, allowing free text searching of the field. Six
+ optional parameters may be supplied:
+
+ - 'weight' is a multiplier to apply to the importance of the field. This
+ must be an integer, and the default value is 1.
+ - 'language' is the language to use when processing the field. This can
+ be expressed as an ISO 2-letter language code. The supported languages
+ are those supported by the xapian core in use.
+ - 'stop' is an iterable of stopwords to filter out of the generated
+ terms. Note that due to Xapian design, only non-positional terms are
+ affected, so this is of limited use.
+ - 'spell' is a boolean flag - if true, the contents of the field will be
+ used for spelling correction.
+ - 'nopos' is a boolean flag - if true, positional information is not
+ stored.
+ - 'noprefix' is a boolean flag - if true, prevents terms with the field
+ prefix being generated. This means that searches specific to this
+ field will not work, and thus should only be used for special cases.
+
+ - `SORTABLE`: index the content of the field such that it can be used to
+ sort result sets. It also allows result sets to be restricted to those
+ documents with field values in a given range. One optional parameter
+ may be supplied:
+
+ - 'type' is a value indicating how to sort the field. It has several
+ possible values:
+
+ - 'string' - sort in lexicographic (ie, alphabetical) order.
+ This is the default, used if no type is set.
+ - 'float' - treat the values as (decimal representations of) floating
+ point numbers, and sort in numerical order. The values in the field
+ must be valid floating point numbers (according to Python's float()
+ function).
+ - 'date' - sort in date order. The values must be valid dates (either
+ Python datetime.date objects, or ISO 8601 format (ie, YYYYMMDD or
+ YYYY-MM-DD)).
+
+ - `COLLAPSE`: index the content of the field such that it can be used to
+ "collapse" result sets, such that only the highest result with each value
+ of the field will be returned.
+
+ """
+
+ # See the class docstring for the meanings of the following constants.
+ STORE_CONTENT = 1
+ INDEX_EXACT = 2
+ INDEX_FREETEXT = 3
+ SORTABLE = 4
+ COLLAPSE = 5
+
+ # Sorting and collapsing store the data in a value, but the format depends
+ # on the sort type. Easiest way to implement is to treat them as the same
+ # action.
+ SORT_AND_COLLAPSE = -1
+
+ # NEED_SLOT is a flag used to indicate that an action needs a slot number
+ NEED_SLOT = 1
+ # NEED_PREFIX is a flag used to indicate that an action needs a prefix
+ NEED_PREFIX = 2
+
+ def __init__(self, fieldname):
+ # Dictionary of actions, keyed by type.
+ self._actions = {}
+ self._fieldname = fieldname
+
+ def add(self, field_mappings, action, **kwargs):
+ """Add an action to perform on a field.
+
+ """
+ if action not in (FieldActions.STORE_CONTENT,
+ FieldActions.INDEX_EXACT,
+ FieldActions.INDEX_FREETEXT,
+ FieldActions.SORTABLE,
+ FieldActions.COLLAPSE,):
+ raise _errors.IndexerError("Unknown field action: %r" % action)
+
+ info = self._action_info[action]
+
+ # Check parameter names
+ for key in kwargs.keys():
+ if key not in info[1]:
+ raise _errors.IndexerError("Unknown parameter name for action %r: %r" % (info[0], key))
+
+ # Fields cannot be indexed both with "EXACT" and "FREETEXT": whilst we
+ # could implement this, the query parser wouldn't know what to do with
+ # searches.
+ if action == FieldActions.INDEX_EXACT:
+ if FieldActions.INDEX_FREETEXT in self._actions:
+ raise _errors.IndexerError("Field %r is already marked for indexing "
+ "as free text: cannot mark for indexing "
+ "as exact text as well" % self._fieldname)
+ if action == FieldActions.INDEX_FREETEXT:
+ if FieldActions.INDEX_EXACT in self._actions:
+ raise _errors.IndexerError("Field %r is already marked for indexing "
+ "as exact text: cannot mark for indexing "
+ "as free text as well" % self._fieldname)
+
+ # Fields cannot be indexed as more than one type for "SORTABLE": to
+ # implement this, we'd need to use a different prefix for each sortable
+ # type, but even then the search end wouldn't know what to sort on when
+ # searching. Also, if they're indexed as "COLLAPSE", the value must be
+ # stored in the right format for the type "SORTABLE".
+ if action == FieldActions.SORTABLE or action == FieldActions.COLLAPSE:
+ if action == FieldActions.COLLAPSE:
+ sorttype = None
+ else:
+ try:
+ sorttype = kwargs['type']
+ except KeyError:
+ sorttype = 'string'
+ kwargs['type'] = sorttype
+ action = FieldActions.SORT_AND_COLLAPSE
+
+ try:
+ oldsortactions = self._actions[FieldActions.SORT_AND_COLLAPSE]
+ except KeyError:
+ oldsortactions = ()
+
+ if len(oldsortactions) > 0:
+ for oldsortaction in oldsortactions:
+ oldsorttype = oldsortaction['type']
+
+ if sorttype == oldsorttype or oldsorttype is None:
+ # Use new type
+ self._actions[action] = []
+ elif sorttype is None:
+ # Use old type
+ return
+ else:
+ raise _errors.IndexerError("Field %r is already marked for "
+ "sorting, with a different "
+ "sort type" % self._fieldname)
+
+ if self.NEED_PREFIX in info[3]:
+ field_mappings.add_prefix(self._fieldname)
+ if self.NEED_SLOT in info[3]:
+ field_mappings.add_slot(self._fieldname)
+
+ # Make an entry for the action
+ if action not in self._actions:
+ self._actions[action] = []
+
+ # Check for repetitions of actions
+ for old_action in self._actions[action]:
+ if old_action == kwargs:
+ return
+
+ # Append the action to the list of actions
+ self._actions[action].append(kwargs)
+
+ def perform(self, doc, value, context):
+ """Perform the actions on the field.
+
+ - `doc` is a ProcessedDocument to store the result of the actions in.
+ - `value` is a string holding the value of the field.
+ - `context` is an ActionContext object used to keep state in.
+
+ """
+ for type, actionlist in self._actions.iteritems():
+ info = self._action_info[type]
+ for kwargs in actionlist:
+ info[2](self._fieldname, doc, value, context, **kwargs)
+
+ _action_info = {
+ STORE_CONTENT: ('STORE_CONTENT', (), _act_store_content, (), ),
+ INDEX_EXACT: ('INDEX_EXACT', (), _act_index_exact, (NEED_PREFIX,), ),
+ INDEX_FREETEXT: ('INDEX_FREETEXT', ('weight', 'language', 'stop', 'spell', 'nopos', 'noprefix', ),
+ _act_index_freetext, (NEED_PREFIX, ), ),
+ SORTABLE: ('SORTABLE', ('type', ), None, (NEED_SLOT,), ),
+ COLLAPSE: ('COLLAPSE', (), None, (NEED_SLOT,), ),
+ SORT_AND_COLLAPSE: ('SORT_AND_COLLAPSE', ('type', ), _act_sort_and_collapse, (NEED_SLOT,), ),
+ }
+
+if __name__ == '__main__':
+ import doctest, sys
+ doctest.testmod (sys.modules[__name__])
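
The add() method enforces the constraints described in the FieldActions docstring: INDEX_EXACT and INDEX_FREETEXT are mutually exclusive, and SORTABLE/COLLAPSE are folded into the internal SORT_AND_COLLAPSE action. Normally this is driven through IndexerConnection.add_field_action(), but a sketch against the class directly shows the conflict check:

from secore import fieldactions, fieldmappings, errors

maps = fieldmappings.FieldMappings()
actions = fieldactions.FieldActions('category')
actions.add(maps, fieldactions.FieldActions.INDEX_EXACT)
actions.add(maps, fieldactions.FieldActions.SORTABLE, type='string')

try:
    actions.add(maps, fieldactions.FieldActions.INDEX_FREETEXT)
except errors.IndexerError:
    pass   # EXACT and FREETEXT on the same field are rejected
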
diff --git a/secore/fieldmappings.py b/secore/fieldmappings.py
new file mode 100644
index 0000000..3838ce5
--- /dev/null
+++ b/secore/fieldmappings.py
@@ -0,0 +1,123 @@
+#!/usr/bin/env python
+#
+# Copyright (C) 2007 Lemur Consulting Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+r"""fieldmappings.py: Mappings from field names to term prefixes, etc.
+
+"""
+__docformat__ = "restructuredtext en"
+
+import cPickle as _cPickle
+
+class FieldMappings(object):
+ """Mappings from field names to term prefixes, slot values, etc.
+
+ The following mappings are maintained:
+
+ - a mapping from field name to the string prefix to insert at the start of
+ terms.
+ - a mapping from field name to the slot numbers to store the field contents
+ in.
+
+ """
+ __slots__ = '_prefixes', '_prefixcount', '_slots', '_slotcount',
+
+ def __init__(self, serialised=None):
+ """Create a new field mapping object, or unserialise a saved one.
+
+ """
+ if serialised is not None:
+ (self._prefixes, self._prefixcount,
+ self._slots, self._slotcount) = _cPickle.loads(serialised)
+ else:
+ self._prefixes = {}
+ self._prefixcount = 0
+ self._slots = {}
+ self._slotcount = 0
+
+ def _genPrefix(self):
+ """Generate a previously unused prefix.
+
+ Prefixes are uppercase letters, and start with 'X' (this is a Xapian
+ convention, for compatibility with other Xapian tools: other starting
+ letters are reserved for special meanings):
+
+ >>> maps = FieldMappings()
+ >>> maps._genPrefix()
+ 'XA'
+ >>> maps._genPrefix()
+ 'XB'
+ >>> [maps._genPrefix() for i in xrange(60)]
+ ['XC', 'XD', 'XE', 'XF', 'XG', 'XH', 'XI', 'XJ', 'XK', 'XL', 'XM', 'XN', 'XO', 'XP', 'XQ', 'XR', 'XS', 'XT', 'XU', 'XV', 'XW', 'XX', 'XY', 'XZ', 'XAA', 'XBA', 'XCA', 'XDA', 'XEA', 'XFA', 'XGA', 'XHA', 'XIA', 'XJA', 'XKA', 'XLA', 'XMA', 'XNA', 'XOA', 'XPA', 'XQA', 'XRA', 'XSA', 'XTA', 'XUA', 'XVA', 'XWA', 'XXA', 'XYA', 'XZA', 'XAB', 'XBB', 'XCB', 'XDB', 'XEB', 'XFB', 'XGB', 'XHB', 'XIB', 'XJB']
+ >>> maps = FieldMappings()
+ >>> [maps._genPrefix() for i in xrange(27*26 + 5)][-10:]
+ ['XVZ', 'XWZ', 'XXZ', 'XYZ', 'XZZ', 'XAAA', 'XBAA', 'XCAA', 'XDAA', 'XEAA']
+ """
+ res = []
+ self._prefixcount += 1
+ num = self._prefixcount
+ while num != 0:
+ ch = (num - 1) % 26
+ res.append(chr(ch + ord('A')))
+ num -= ch
+ num = num // 26
+ return 'X' + ''.join(res)
+
+ def get_prefix(self, fieldname):
+ """Get the prefix used for a given field name.
+
+ """
+ return self._prefixes[fieldname]
+
+ def get_slot(self, fieldname):
+ """Get the slot number used for a given field name.
+
+ """
+ return self._slots[fieldname]
+
+ def add_prefix(self, fieldname):
+ """Allocate a prefix for the given field.
+
+ If a prefix is already allocated for this field, this has no effect.
+
+ """
+ if fieldname in self._prefixes:
+ return
+ self._prefixes[fieldname] = self._genPrefix()
+
+ def add_slot(self, fieldname):
+ """Allocate a slot number for the given field.
+
+ If a slot number is already allocated for this field, this has no effect.
+
+ """
+ if fieldname in self._slots:
+ return
+ self._slots[fieldname] = self._slotcount
+ self._slotcount += 1
+
+ def serialise(self):
+ """Serialise the field mappings to a string.
+
+ This can be unserialised by passing the result of this method to the
+ constructor of a new FieldMappings object.
+
+ """
+ return _cPickle.dumps((self._prefixes,
+ self._prefixcount,
+ self._slots,
+ self._slotcount,
+ ), 2)
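
serialise() and the constructor's `serialised` argument form a round trip; this is how IndexerConnection persists the mappings in its config file (see indexerconnection.py below). A quick sketch:

from secore import fieldmappings

maps = fieldmappings.FieldMappings()
maps.add_prefix('author')           # first prefix allocated is 'XA'
maps.add_slot('author')             # first slot allocated is 0

data = maps.serialise()             # a pickled tuple of the internal state
maps2 = fieldmappings.FieldMappings(data)
assert maps2.get_prefix('author') == 'XA'
assert maps2.get_slot('author') == 0
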
diff --git a/secore/highlight.py b/secore/highlight.py
new file mode 100644
index 0000000..38f2050
--- /dev/null
+++ b/secore/highlight.py
@@ -0,0 +1,310 @@
+#!/usr/bin/env python
+#
+# Copyright (C) 2007 Lemur Consulting Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+r"""highlight.py: Highlight and summarise text.
+
+"""
+__docformat__ = "restructuredtext en"
+
+import re
+import xapian
+
+class Highlighter(object):
+ """Class for highlighting text and creating contextual summaries.
+
+ >>> hl = Highlighter("en")
+ >>> hl.makeSample('Hello world.', ['world'])
+ 'Hello world.'
+ >>> hl.highlight('Hello world', ['world'], ('<', '>'))
+ 'Hello <world>'
+
+ """
+
+ # split string into words, spaces, punctuation and markup tags
+ _split_re = re.compile(
+ '</\\w+>|<\\w+(?:\\s*\\w+="[^"]*"|\\s*\\w+)*\\s*>|[\\w\']+|\\s+|[^\\w\'\\s<>/]+')
+
+ def __init__(self, language_code='en', stemmer=None):
+ """Create a new highlighter for the specified language.
+
+ """
+ if stemmer is not None:
+ self.stem = stemmer
+ else:
+ self.stem = xapian.Stem(language_code)
+
+ def _split_text(self, text, strip_tags=False):
+ """Split some text into words and non-words.
+
+ - `text` is the text to process. It may be a unicode object or a utf-8
+ encoded simple string.
+ - `strip_tags` is a flag - False to keep tags, True to strip all tags
+ from the output.
+
+ Returns a list of utf-8 encoded simple strings.
+
+ """
+ if isinstance(text, unicode):
+ text = text.encode('utf-8')
+
+ words = self._split_re.findall(text)
+ if strip_tags:
+ return [w for w in words if w[0] != '<']
+ else:
+ return words
+
+ def _strip_prefix(self, term):
+ """Strip the prefix off a term.
+
+ Prefixes are any initial capital letters, with the exception that R always
+ ends a prefix, even if followed by capital letters.
+
+ >>> hl = Highlighter("en")
+ >>> print hl._strip_prefix('hello')
+ hello
+ >>> print hl._strip_prefix('Rhello')
+ hello
+ >>> print hl._strip_prefix('XARHello')
+ Hello
+ >>> print hl._strip_prefix('XAhello')
+ hello
+ >>> print hl._strip_prefix('XAh')
+ h
+ >>> print hl._strip_prefix('XA')
+ <BLANKLINE>
+
+ """
+ for p in xrange(len(term)):
+ if term[p].islower():
+ return term[p:]
+ elif term[p] == 'R':
+ return term[p+1:]
+ return ''
+
+ def _query_to_stemmed_words(self, query):
+ """Convert a query to a list of stemmed words.
+
+ - `query` is the query to parse: it may be xapian.Query object, or a
+ sequence of terms.
+
+ """
+ if isinstance(query, xapian.Query):
+ return [self._strip_prefix(t) for t in query]
+ else:
+ return [self.stem(q.lower()) for q in query]
+
+
+ def makeSample(self, text, query, maxlen=600, hl=None):
+ """Make a contextual summary from the supplied text.
+
+ This basically works by splitting the text into phrases, counting the query
+ terms in each, and keeping those with the most.
+
+ Any markup tags in the text will be stripped.
+
+ `text` is the source text to summarise.
+ `query` is either a Xapian query object or a list of (unstemmed) term strings.
+ `maxlen` is the maximum length of the generated summary.
+ `hl` is a pair of strings to insert around highlighted terms, e.g. ('<b>', '</b>')
+
+ """
+
+ words = self._split_text(text, True)
+ terms = self._query_to_stemmed_words(query)
+
+ # build blocks delimited by punctuation, and count matching words in each block
+ # blocks[n] is a block [firstword, endword, charcount, termcount, selected]
+ blocks = []
+ start = end = count = blockchars = 0
+
+ while end < len(words):
+ blockchars += len(words[end])
+ if words[end].isalnum():
+ if self.stem(words[end].lower()) in terms:
+ count += 1
+ end += 1
+ elif words[end] in ',.;:?!\n':
+ end += 1
+ blocks.append([start, end, blockchars, count, False])
+ start = end
+ blockchars = 0
+ count = 0
+ else:
+ end += 1
+ if start != end:
+ blocks.append([start, end, blockchars, count, False])
+ if len(blocks) == 0:
+ return ''
+
+ # select high-scoring blocks first, down to zero-scoring
+ chars = 0
+ for count in xrange(3, -1, -1):
+ for b in blocks:
+ if b[3] >= count:
+ b[4] = True
+ chars += b[2]
+ if chars >= maxlen: break
+ if chars >= maxlen: break
+
+ # assemble summary
+ words2 = []
+ lastblock = -1
+ for i, b in enumerate(blocks):
+ if b[4]:
+ if i != lastblock + 1:
+ words2.append('..')
+ words2.extend(words[b[0]:b[1]])
+ lastblock = i
+
+ if not blocks[-1][4]:
+ words2.append('..')
+
+ # trim down to maxlen
+ l = 0
+ for i in xrange (len (words2)):
+ l += len (words2[i])
+ if l >= maxlen:
+ words2[i:] = ['..']
+ break
+
+ if hl is None:
+ return ''.join(words2)
+ else:
+ return self._hl(words2, terms, hl)
+
+ def highlight(self, text, query, hl, strip_tags=False):
+ """Add highlights (string prefix/postfix) to a string.
+
+ `text` is the source to highlight.
+ `query` is either a Xapian query object or a list of (unstemmed) term strings.
+ `hl` is a pair of highlight strings, e.g. ('<i>', '</i>')
+ `strip_tags` strips HTML markup iff True
+
+ >>> hl = Highlighter()
+ >>> qp = xapian.QueryParser()
+ >>> q = qp.parse_query('cat dog')
+ >>> tags = ('[[', ']]')
+ >>> hl.highlight('The cat went Dogging; but was <i>dog tired</i>.', q, tags)
+ 'The [[cat]] went [[Dogging]]; but was <i>[[dog]] tired</i>.'
+
+ """
+ words = self._split_text(text, strip_tags)
+ terms = self._query_to_stemmed_words(query)
+ return self._hl(words, terms, hl)
+
+ def _hl(self, words, terms, hl):
+ """Add highlights to a list of words.
+
+ `words` is the list of words and non-words to be highlighted.
+ `terms` is the list of stemmed words to look for.
+
+ """
+ for i, w in enumerate(words):
+ if self.stem(words[i].lower()) in terms:
+ words[i] = ''.join((hl[0], w, hl[1]))
+
+ return ''.join(words)
+
+
+__test__ = {
+ 'no_punc': r'''
+
+ Test the highlighter's behaviour when there is no punctuation in the sample
+ text (regression test - used to return no output):
+ >>> hl = Highlighter("en")
+ >>> hl.makeSample('Hello world', ['world'])
+ 'Hello world'
+
+ ''',
+
+ 'stem_levels': r'''
+
+ Test highlighting of words, and how it works with stemming:
+ >>> hl = Highlighter("en")
+
+ # "word" and "wording" stem to "word", so the following 4 calls all return
+ # the same thing
+ >>> hl.makeSample('Hello. word. wording. wordinging.', ['word'], hl='<>')
+ 'Hello. <word>. <wording>. wordinging.'
+ >>> hl.highlight('Hello. word. wording. wordinging.', ['word'], '<>')
+ 'Hello. <word>. <wording>. wordinging.'
+ >>> hl.makeSample('Hello. word. wording. wordinging.', ['wording'], hl='<>')
+ 'Hello. <word>. <wording>. wordinging.'
+ >>> hl.highlight('Hello. word. wording. wordinging.', ['wording'], '<>')
+ 'Hello. <word>. <wording>. wordinging.'
+
+ # "wordinging" stems to "wording", so only the last word is highlighted for
+ # this one.
+ >>> hl.makeSample('Hello. word. wording. wordinging.', ['wordinging'], hl='<>')
+ 'Hello. word. wording. <wordinging>.'
+ >>> hl.highlight('Hello. word. wording. wordinging.', ['wordinging'], '<>')
+ 'Hello. word. wording. <wordinging>.'
+ ''',
+
+ 'supplied_stemmer': r'''
+
+ Test behaviour if we pass in our own stemmer:
+ >>> stem = xapian.Stem('en')
+ >>> hl = Highlighter(stemmer=stem)
+ >>> hl.highlight('Hello. word. wording. wordinging.', ['word'], '<>')
+ 'Hello. <word>. <wording>. wordinging.'
+
+ ''',
+
+ 'unicode': r'''
+
+ Test behaviour if we pass in unicode input:
+ >>> hl = Highlighter('en')
+ >>> hl.highlight(u'Hello\xf3. word. wording. wordinging.', ['word'], '<>')
+ 'Hello\xc3\xb3. <word>. <wording>. wordinging.'
+
+ ''',
+
+ 'no_sample': r'''
+
+ Test behaviour if we pass in empty input:
+ >>> hl = Highlighter('en')
+ >>> hl.makeSample(u'', ['word'])
+ ''
+
+ ''',
+
+ 'short_samples': r'''
+
+ >>> hl = Highlighter('en')
+ >>> hl.makeSample("A boring start. Hello world indeed. A boring end.", ['hello'], 20, ('<', '>'))
+ '.. <Hello> world ..'
+ >>> hl.makeSample("A boring start. Hello world indeed. A boring end.", ['hello'], 40, ('<', '>'))
+ 'A boring start. <Hello> world indeed...'
+ >>> hl.makeSample("A boring start. Hello world indeed. A boring end.", ['boring'], 40, ('<', '>'))
+ 'A <boring> start... A <boring> end.'
+
+ ''',
+
+ 'apostrophes': r'''
+
+ >>> hl = Highlighter('en')
+ >>> hl.makeSample("A boring start. Hello world's indeed. A boring end.", ['world'], 40, ('<', '>'))
+ "A boring start. Hello <world's> indeed..."
+
+ ''',
+
+}
+
+if __name__ == '__main__':
+ import doctest, sys
+ doctest.testmod (sys.modules[__name__])
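
The doctests above don't exercise strip_tags; a sketch of how it interacts with the tag-aware splitter, based on _split_text() and _hl():

from secore import highlight

hl = highlight.Highlighter('en')
# Default: tags are kept as separate tokens and never highlighted.
hl.highlight('<b>cat</b> nap', ['cat'], ('[', ']'))
# -> '<b>[cat]</b> nap'
# strip_tags=True drops the markup tokens before highlighting.
hl.highlight('<b>cat</b> nap', ['cat'], ('[', ']'), strip_tags=True)
# -> '[cat] nap'
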
diff --git a/secore/indexerconnection.py b/secore/indexerconnection.py
new file mode 100644
index 0000000..be82319
--- /dev/null
+++ b/secore/indexerconnection.py
@@ -0,0 +1,380 @@
+#!/usr/bin/env python
+#
+# Copyright (C) 2007 Lemur Consulting Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+r"""indexerconnection.py: A connection to the search engine for indexing.
+
+"""
+__docformat__ = "restructuredtext en"
+
+import xapian as _xapian
+from datastructures import *
+from fieldactions import *
+import fieldmappings as _fieldmappings
+import errors as _errors
+import os as _os
+import cPickle as _cPickle
+
+class IndexerConnection(object):
+ """A connection to the search engine for indexing.
+
+ """
+
+ def __init__(self, indexpath):
+ """Create a new connection to the index.
+
+ There may only be one indexer connection for a particular database open
+ at a given time. Therefore, if a connection to the database is already
+ open, this will raise a xapian.DatabaseLockError.
+
+ If the database doesn't already exist, it will be created.
+
+ """
+ self._index = _xapian.WritableDatabase(indexpath, _xapian.DB_CREATE_OR_OPEN)
+ self._indexpath = indexpath
+
+ # Read existing actions.
+ self._field_actions = {}
+ self._field_mappings = _fieldmappings.FieldMappings()
+ self._next_docid = 0
+ self._config_modified = False
+ self._load_config()
+
+ def _store_config(self):
+ """Store the configuration for the database.
+
+ Currently, this stores the configuration in a file in the database
+ directory, so changes to it are not protected by transactions. When
+ support is available in xapian for storing metadata associated with
+ databases, this will be used instead of a file.
+
+ """
+ config_str = _cPickle.dumps((
+ self._field_actions,
+ self._field_mappings.serialise(),
+ self._next_docid,
+ ), 2)
+ config_file = _os.path.join(self._indexpath, 'config')
+ fd = open(config_file, "w")
+ fd.write(config_str)
+ fd.close()
+ self._config_modified = False
+
+ def _load_config(self):
+ """Load the configuration for the database.
+
+ """
+ config_file = _os.path.join(self._indexpath, 'config')
+ if not _os.path.exists(config_file):
+ return
+ fd = open(config_file)
+ config_str = fd.read()
+ fd.close()
+
+ (self._field_actions, mappings, self._next_docid) = _cPickle.loads(config_str)
+ self._field_mappings = _fieldmappings.FieldMappings(mappings)
+ self._config_modified = False
+
+ def _allocate_id(self):
+ """Allocate a new ID.
+
+ """
+ while True:
+ idstr = "%x" % self._next_docid
+ self._next_docid += 1
+ if not self._index.term_exists('Q' + idstr):
+ break
+ self._config_modified = True
+ return idstr
+
+ def add_field_action(self, fieldname, fieldtype, **kwargs):
+ """Add an action to be performed on a field.
+
+ Note that this change to the configuration will not be preserved on
+ disk until the next call to flush().
+
+ """
+ if self._index is None:
+ raise _errors.IndexerError("IndexerConnection has been closed")
+ if fieldname in self._field_actions:
+ actions = self._field_actions[fieldname]
+ else:
+ actions = FieldActions(fieldname)
+ self._field_actions[fieldname] = actions
+ actions.add(self._field_mappings, fieldtype, **kwargs)
+ self._config_modified = True
+
+ def clear_field_actions(self, fieldname):
+ """Clear all actions for the specified field.
+
+ This does not report an error if there are already no actions for the
+ specified field.
+
+ Note that this change to the configuration will not be preserved on
+ disk until the next call to flush().
+
+ """
+ if self._index is None:
+ raise _errors.IndexerError("IndexerConnection has been closed")
+ if fieldname in self._field_actions:
+ del self._field_actions[fieldname]
+ self._config_modified = True
+
+ def process(self, document):
+ """Process an UnprocessedDocument with the settings in this database.
+
+ The resulting ProcessedDocument is returned.
+
+ Note that this processing will be automatically performed if an
+ UnprocessedDocument is supplied to the add() or replace() methods of
+ IndexerConnection. This method is exposed to allow the processing to
+ be performed separately, which may be desirable if you wish to manually
+ modify the processed document before adding it to the database, or if
+ you want to split processing of documents from adding documents to the
+ database for performance reasons.
+
+ """
+ if self._index is None:
+ raise _errors.IndexerError("IndexerConnection has been closed")
+ result = ProcessedDocument(self._field_mappings)
+ result.id = document.id
+ context = ActionContext(self._index)
+
+ for field in document.fields:
+ try:
+ actions = self._field_actions[field.name]
+ except KeyError:
+ # If no actions are defined, just ignore the field.
+ continue
+ actions.perform(result, field.value, context)
+
+ return result
+
+ def add(self, document):
+ """Add a new document to the search engine index.
+
+ If the document has an id set, and the id already exists in
+ the database, an exception will be raised. Use the replace() method
+ instead if you wish to overwrite documents.
+
+ Returns the id of the newly added document (allocating a new
+ unique ID if no id was set).
+
+ The supplied document may be an instance of UnprocessedDocument, or an
+ instance of ProcessedDocument.
+
+ """
+ if self._index is None:
+ raise _errors.IndexerError("IndexerConnection has been closed")
+ if not hasattr(document, '_doc'):
+ # It's not a processed document.
+ document = self.process(document)
+
+ # Ensure that we have an id
+ orig_id = document.id
+ if orig_id is None:
+ id = self._allocate_id()
+ document.id = id
+ else:
+ id = orig_id
+ if self._index.term_exists('Q' + id):
+ raise _errors.IndexerError("Document ID of document supplied to add() is not unique.")
+
+ # Add the document.
+ xapdoc = document.prepare()
+ self._index.add_document(xapdoc)
+
+ if id is not orig_id:
+ document.id = orig_id
+ return id
+
+ def replace(self, document):
+ """Replace a document in the search engine index.
+
+ If the document does not have an id set, an exception will be
+ raised.
+
+ If the document has an id set, and the id does not already
+ exist in the database, this method will have the same effect as add().
+
+ """
+ if self._index is None:
+ raise _errors.IndexerError("IndexerConnection has been closed")
+ if not hasattr(document, '_doc'):
+ # It's not a processed document.
+ document = self.process(document)
+
+ # Ensure that we have an id
+ id = document.id
+ if id is None:
+ raise _errors.IndexerError("No document ID set for document supplied to replace().")
+
+ xapdoc = document.prepare()
+ self._index.replace_document('Q' + id, xapdoc)
+
+ def delete(self, id):
+ """Delete a document from the search engine index.
+
+ If the id does not already exist in the database, this method
+ will have no effect (and will not report an error).
+
+ """
+ if self._index is None:
+ raise _errors.IndexerError("IndexerConnection has been closed")
+ self._index.delete_document('Q' + id)
+
+ def flush(self):
+ """Apply recent changes to the database.
+
+ If an exception occurs, any changes since the last call to flush() may
+ be lost.
+
+ """
+ if self._index is None:
+ raise _errors.IndexerError("IndexerConnection has been closed")
+ if self._config_modified:
+ self._store_config()
+ self._index.flush()
+
+ def close(self):
+ """Close the connection to the database.
+
+ It is important to call this method before allowing the class to be
+ garbage collected, because it will ensure that any un-flushed changes
+ will be flushed. It also ensures that the connection is cleaned up
+ promptly.
+
+ No other methods may be called on the connection after this has been
+ called. (It is permissible to call close() multiple times, but
+ only the first call will have any effect.)
+
+ If an exception occurs, the database will be closed, but changes since
+ the last call to flush may be lost.
+
+ """
+ if self._index is None:
+ return
+ try:
+ self.flush()
+ finally:
+ # There is currently no "close()" method for xapian databases, so
+ # we have to rely on the garbage collector. Since we never copy
+ # the _index property out of this class, there should be no cycles,
+ # so the standard python implementation should garbage collect
+ # _index straight away. A close() method is planned to be added to
+ # xapian at some point - when it is, we should call it here to make
+ # the code more robust.
+ self._index = None
+ self._indexpath = None
+ self._field_actions = None
+ self._config_modified = False
+
+ def get_doccount(self):
+ """Count the number of documents in the database.
+
+ This count will include documents which have been added or removed but
+ not yet flushed.
+
+ """
+ if self._index is None:
+ raise _errors.IndexerError("IndexerConnection has been closed")
+ return self._index.get_doccount()
+
+ def iterids(self):
+ """Get an iterator which returns all the ids in the database.
+
+ The unique IDs are currently returned in binary lexicographical sort
+ order, but this should not be relied on.
+
+ """
+ if self._index is None:
+ raise _errors.IndexerError("IndexerConnection has been closed")
+ return PrefixedTermIter('Q', self._index.allterms())
+
+ def get_document(self, id):
+ """Get the document with the specified unique ID.
+
+ Raises a KeyError if there is no such document. Otherwise, it returns
+ a ProcessedDocument.
+
+ """
+ if self._index is None:
+ raise _errors.IndexerError("IndexerConnection has been closed")
+ postlist = self._index.postlist('Q' + id)
+ try:
+ plitem = postlist.next()
+ except StopIteration:
+ # Unique ID not found
+ raise KeyError('Unique ID %r not found' % id)
+ try:
+ postlist.next()
+ raise _errors.IndexerError("Multiple documents " #pragma: no cover
+ "found with same unique ID")
+ except StopIteration:
+ # Only one instance of the unique ID found, as it should be.
+ pass
+
+ result = ProcessedDocument(self._field_mappings)
+ result.id = id
+ result._doc = self._index.get_document(plitem.docid)
+ return result
+
+class PrefixedTermIter(object):
+ """Iterate through all the terms with a given prefix.
+
+ """
+ def __init__(self, prefix, termiter):
+ """Initialise the prefixed term iterator.
+
+ - `prefix` is the prefix to return terms for.
+ - `termiter` is a xapian TermIterator, which should be at its start.
+
+ """
+
+ # The algorithm used in next() currently only works for single
+ # character prefixes, so assert that the prefix is single character.
+ # To deal with multicharacter prefixes, we need to check for terms
+ # which have a starting prefix equal to that given, but then have a
+ # following uppercase alphabetic character, indicating that the actual
+ # prefix is longer than the target prefix. We then need to skip over
+ # these. Not too hard to implement, but we don't need it yet.
+ assert(len(prefix) == 1)
+
+ self._started = False
+ self._prefix = prefix
+ self._prefixlen = len(prefix)
+ self._termiter = termiter
+
+ def __iter__(self):
+ return self
+
+ def next(self):
+ """Get the next term with the specified prefix.
+
+ """
+ if not self._started:
+ term = self._termiter.skip_to(self._prefix).term
+ self._started = True
+ else:
+ term = self._termiter.next().term
+ if len(term) < self._prefixlen or term[:self._prefixlen] != self._prefix:
+ raise StopIteration
+ return term[self._prefixlen:]
+
+if __name__ == '__main__':
+ import doctest, sys
+ doctest.testmod (sys.modules[__name__])
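
As the process() docstring notes, processing can be split from adding, for example to tweak a ProcessedDocument before it is stored. A sketch of that split (path and field name illustrative):

import secore

conn = secore.IndexerConnection('/tmp/testdb')   # illustrative path
conn.add_field_action('text', secore.FieldActions.INDEX_FREETEXT)

udoc = secore.UnprocessedDocument()
udoc.fields.append(secore.Field('text', 'some text'))

pdoc = conn.process(udoc)            # explicit processing step
pdoc.add_term('text', 'extraterm')   # manual tweak before storing
docid = conn.add(pdoc)               # add() sees _doc and skips reprocessing
conn.flush()
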
diff --git a/secore/marshall.py b/secore/marshall.py
new file mode 100644
index 0000000..ebcc71d
--- /dev/null
+++ b/secore/marshall.py
@@ -0,0 +1,73 @@
+#!/usr/bin/env python
+#
+# Copyright (C) 2007 Lemur Consulting Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+r"""marshall.py: Marshal values into strings
+
+"""
+__docformat__ = "restructuredtext en"
+
+import math
+
+def _long_to_base256_array(value, length, flip):
+ result = []
+ for i in xrange(length):
+ n = value % 256
+ if flip: n = 255 - n
+ result.insert(0, chr(n))
+ value /= 256
+ return result
+
+def float_to_string(value):
+ """Marshall a floating point number to a string which sorts in the
+ appropriate manner.
+
+ """
+ mantissa, exponent = math.frexp(value)
+ sign = '1'
+ if mantissa < 0:
+ mantissa = -mantissa
+ sign = '0'
+
+ # IEEE representation of doubles uses 11 bits for the exponent, with a bias
+ # of 1023. There's then another 52 bits in the mantissa, so we need to
+ # add 1075 to be sure that the exponent won't be negative.
+ # Even then, we check that the exponent isn't negative, and consider the
+ # value to be equal to zero if it is.
+ exponent += 1075
+ if exponent < 0: # Note - this can't happen on most architectures #pragma: no cover
+ exponent = 0
+ mantissa = 0
+ elif mantissa == 0:
+ exponent = 0
+
+ # IEEE representation of doubles uses 52 bits for the mantissa. Convert it
+ # to a 7 character string, and convert the exponent to a 2 character
+ # string.
+
+ mantissa = long(mantissa * (2**52))
+
+ digits = [sign]
+ digits.extend(_long_to_base256_array(exponent, 2, sign == '0'))
+ digits.extend(_long_to_base256_array(mantissa, 7, sign == '0'))
+
+ return ''.join(digits)
+
+def date_to_string(date):
+ """Marshall a date to a string which sorts in the appropriate manner.
+
+ """
+ return '%04d%02d%02d' % (date.year, date.month, date.day)
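
The point of float_to_string() is that the byte order of its outputs matches the numeric order of its inputs, so the strings can be stored in Xapian value slots and compared lexicographically. A quick self-check of that property:

from secore import marshall
import datetime

values = [-2.5, -0.1, 0.0, 0.1, 1.0, 1000.0]
keys = [marshall.float_to_string(v) for v in values]
assert keys == sorted(keys)          # string order matches numeric order

marshall.date_to_string(datetime.date(2007, 7, 11))   # -> '20070711'
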
diff --git a/secore/parsedate.py b/secore/parsedate.py
new file mode 100644
index 0000000..684d5f2
--- /dev/null
+++ b/secore/parsedate.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python
+#
+# Copyright (C) 2007 Lemur Consulting Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+r"""parsedate.py: Parse date strings.
+
+"""
+__docformat__ = "restructuredtext en"
+
+import datetime
+import re
+
+yyyymmdd_re = re.compile(r'(?P<year>[0-9]{4})(?P<month>[0-9]{2})(?P<day>[0-9]{2})$')
+yyyy_mm_dd_re = re.compile(r'(?P<year>[0-9]{4})([-/.])(?P<month>[0-9]{2})\2(?P<day>[0-9]{2})$')
+
+def date_from_string(value):
+ """Parse a string into a date.
+
+ If the value supplied is already a date-like object (ie, has 'year',
+ 'month' and 'day' attributes), it is returned without processing.
+
+ Supported date formats are:
+
+ - YYYYMMDD
+ - YYYY-MM-DD
+ - YYYY/MM/DD
+ - YYYY.MM.DD
+
+ """
+ if (hasattr(value, 'year')
+ and hasattr(value, 'month')
+ and hasattr(value, 'day')):
+ return value
+
+ mg = yyyymmdd_re.match(value)
+ if mg is None:
+ mg = yyyy_mm_dd_re.match(value)
+
+ if mg is not None:
+ year, month, day = (int(i) for i in mg.group('year', 'month', 'day'))
+ return datetime.date(year, month, day)
+
+ raise ValueError('Unrecognised date format')
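
date_from_string() accepts the four listed formats and passes date-like objects through untouched; note the \2 backreference in yyyy_mm_dd_re, which requires both separators to match. For example:

from secore import parsedate
import datetime

parsedate.date_from_string('20070711')     # -> datetime.date(2007, 7, 11)
parsedate.date_from_string('2007-07-11')   # '-', '/' and '.' all work
parsedate.date_from_string(datetime.date(2007, 7, 11))   # returned as-is
parsedate.date_from_string('2007-07/11')   # ValueError: mixed separators
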
diff --git a/secore/searchconnection.py b/secore/searchconnection.py
new file mode 100644
index 0000000..79fa509
--- /dev/null
+++ b/secore/searchconnection.py
@@ -0,0 +1,618 @@
+#!/usr/bin/env python
+#
+# Copyright (C) 2007 Lemur Consulting Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+r"""searchconnection.py: A connection to the search engine for searching.
+
+"""
+__docformat__ = "restructuredtext en"
+
+import xapian as _xapian
+from datastructures import *
+from fieldactions import *
+import fieldmappings as _fieldmappings
+import highlight as _highlight
+import errors as _errors
+import os as _os
+import cPickle as _cPickle
+
+class SearchResult(ProcessedDocument):
+ """A result from a search.
+
+ """
+ def __init__(self, msetitem, results):
+ ProcessedDocument.__init__(self, results._fieldmappings, msetitem.document)
+ self.rank = msetitem.rank
+ self._results = results
+
+ def _get_language(self, field):
+ """Get the language that should be used for a given field.
+
+ """
+ actions = self._results._conn._field_actions[field]._actions
+ for action, kwargslist in actions.iteritems():
+ if action == FieldActions.INDEX_FREETEXT:
+ for kwargs in kwargslist:
+ try:
+ return kwargs['language']
+ except KeyError:
+ pass
+ return 'none'
+
+ def summarise(self, field, maxlen=600, hl=('<b>', '</b>')):
+ """Return a summarised version of the field specified.
+
+ This will return a summary of the contents of the field stored in the
+ search result, with words which match the query highlighted.
+
+ The maximum length of the summary (in characters) may be set using the
+ maxlen parameter.
+
+ The return value will be a string holding the summary, with
+ highlighting applied. If there are multiple instances of the field in
+ the document, the instances will be joined with a newline character.
+
+ To turn off highlighting, set hl to None. Each highlight will consist
+ of the first entry in the `hl` list being placed before the word, and
+ the second entry in the `hl` list being placed after the word.
+
+ Any XML or HTML style markup tags in the field will be stripped before
+ the summarisation algorithm is applied.
+
+ """
+ highlighter = _highlight.Highlighter(language_code=self._get_language(field))
+ field = self.data[field]
+ results = []
+ text = '\n'.join(field)
+ return highlighter.makeSample(text, self._results._query, maxlen, hl)
+
+ def highlight(self, field, hl=('<b>', '</b>'), strip_tags=False):
+ """Return a highlighted version of the field specified.
+
+ This will return all the contents of the field stored in the search
+ result, with words which match the query highlighted.
+
+ The return value will be a list of strings (corresponding to the list
+ of strings which is the raw field data).
+
+ Each highlight will consist of the first entry in the `hl` list being
+ placed before the word, and the second entry in the `hl` list being
+ placed after the word.
+
+ If `strip_tags` is True, any XML or HTML style markup tags in the field
+ will be stripped before highlighting is applied.
+
+ """
+ highlighter = _highlight.Highlighter(language_code=self._get_language(field))
+ field = self.data[field]
+ results = []
+ for text in field:
+ results.append(highlighter.highlight(text, self._results._query, hl, strip_tags))
+ return results
+
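+ # An illustrative sketch; the 'title' field name is an assumption:
+ #
+ # for chunk in result.highlight('title', hl=('*', '*'), strip_tags=True):
+ #     print chunk
+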
+ def __repr__(self):
+ return ('<SearchResult(rank=%d, id=%r, data=%r)>' %
+ (self.rank, self.id, self.data))
+
+
+class SearchResultIter(object):
+ """An iterator over a set of results from a search.
+
+ """
+ def __init__(self, results):
+ self._results = results
+ self._iter = iter(results._mset)
+
+ def __iter__(self):
+ # An iterator should itself be iterable.
+ return self
+
+ def next(self):
+ msetitem = self._iter.next()
+ return SearchResult(msetitem, self._results)
+
+
+class SearchResults(object):
+ """A set of results of a search.
+
+ """
+ def __init__(self, conn, enq, query, mset, fieldmappings):
+ self._conn = conn
+ self._enq = enq
+ self._query = query
+ self._mset = mset
+ self._fieldmappings = fieldmappings
+
+ def __repr__(self):
+ return ("<SearchResults(startrank=%d, "
+ "endrank=%d, "
+ "more_matches=%s, "
+ "matches_lower_bound=%d, "
+ "matches_upper_bound=%d, "
+ "matches_estimated=%d, "
+ "estimate_is_exact=%s)>" %
+ (
+ self.startrank,
+ self.endrank,
+ self.more_matches,
+ self.matches_lower_bound,
+ self.matches_upper_bound,
+ self.matches_estimated,
+ self.estimate_is_exact,
+ ))
+
+ def _get_more_matches(self):
+ # This check relies on the search having checked at least one more
+ # result than the endrank (search() guarantees this by setting
+ # checkatleast to endrank + 1).
+ return (self.matches_lower_bound > self.endrank)
+ more_matches = property(_get_more_matches, doc=
+ """Check whether there are further matches after those in this result set.
+
+ """)
+ def _get_startrank(self):
+ return self._mset.get_firstitem()
+ startrank = property(_get_startrank, doc=
+ """Get the rank of the first item in the search results.
+
+ This corresponds to the "startrank" parameter passed to the search() method.
+
+ """)
+ def _get_endrank(self):
+ return self._mset.get_firstitem() + len(self._mset)
+ endrank = property(_get_endrank, doc=
+ """Get the rank of the item after the end of the search results.
+
+ If there are sufficient results in the index, this corresponds to the
+ "endrank" parameter passed to the search() method.
+
+ """)
+ def _get_lower_bound(self):
+ return self._mset.get_matches_lower_bound()
+ matches_lower_bound = property(_get_lower_bound, doc=
+ """Get a lower bound on the total number of matching documents.
+
+ """)
+ def _get_upper_bound(self):
+ return self._mset.get_matches_upper_bound()
+ matches_upper_bound = property(_get_upper_bound, doc=
+ """Get an upper bound on the total number of matching documents.
+
+ """)
+ def _get_estimated(self):
+ return self._mset.get_matches_estimated()
+ matches_estimated = property(_get_estimated, doc=
+ """Get an estimate for the total number of matching documents.
+
+ """)
+ def _estimate_is_exact(self):
+ return self._mset.get_matches_lower_bound() == \
+ self._mset.get_matches_upper_bound()
+ estimate_is_exact = property(_estimate_is_exact, doc=
+ """Check whether the estimated number of matching documents is exact.
+
+ If this returns true, the estimate given by the `matches_estimated`
+ property is guaranteed to be correct.
+
+ If this returns false, it is possible that the actual number of matching
+ documents is different from the number given by the `matches_estimated`
+ property.
+
+ """)
+
+ def get_hit(self, index):
+ """Get the hit with a given index.
+
+ """
+ msetitem = self._mset.get_hit(index)
+ return SearchResult(msetitem, self)
+ __getitem__ = get_hit
+
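+ # An illustrative sketch: random access to a hit by its index within
+ # this result set (not its absolute rank):
+ #
+ # first = results[0]    # equivalent to results.get_hit(0)
+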
+ def __iter__(self):
+ """Get an iterator over the hits in the search result.
+
+ The iterator returns the results in increasing order of rank.
+
+ """
+ return SearchResultIter(self)
+
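+# An illustrative sketch of paging through results; `conn` and `query`
+# are assumptions, not defined in this module:
+#
+# offset, pagesize = 0, 10
+# while True:
+#     results = conn.search(query, offset, offset + pagesize)
+#     for result in results:
+#         print result.rank, result.id
+#     if not results.more_matches:
+#         break
+#     offset += pagesize
+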
+class SearchConnection(object):
+ """A connection to the search engine for searching.
+
+ The connection will access a view of the database.
+
+ """
+
+ def __init__(self, indexpath):
+ """Create a new connection to the index for searching.
+
+ Any number of search connections may be open for a particular
+ database at a given time (regardless of whether a connection for
+ indexing is open as well).
+
+ If the database doesn't exist, an exception will be raised.
+
+ """
+ self._index = _xapian.Database(indexpath)
+ self._indexpath = indexpath
+
+ # Read the actions.
+ self._load_config()
+
+ def _get_sort_type(self, field):
+ """Get the sort type that should be used for a given field.
+
+ """
+ actions = self._field_actions[field]._actions
+ for action, kwargslist in actions.iteritems():
+ if action == FieldActions.SORT_AND_COLLAPSE:
+ for kwargs in kwargslist:
+ return kwargs['type']
+
+ def _load_config(self):
+ """Load the configuration for the database.
+
+ """
+ # FIXME - this code is basically duplicated in the IndexerConnection
+ # class; it should be moved to a shared location.
+ config_file = _os.path.join(self._indexpath, 'config')
+ if not _os.path.exists(config_file):
+ self._field_mappings = _fieldmappings.FieldMappings()
+ return
+ fd = open(config_file)
+ config_str = fd.read()
+ fd.close()
+
+ (self._field_actions, mappings, next_docid) = _cPickle.loads(config_str)
+ self._field_mappings = _fieldmappings.FieldMappings(mappings)
+
+ def reopen(self):
+ """Reopen the connection.
+
+ This updates the revision of the index which the connection references
+ to the latest flushed revision.
+
+ """
+ if self._index is None:
+ raise _errors.SearchError("SearchConnection has been closed")
+ self._index.reopen()
+ # Re-read the actions.
+ self._load_config()
+
+ def close(self):
+ """Close the connection to the database.
+
+ It is important to call this method before allowing the object to be
+ garbage collected, to ensure that the connection is cleaned up promptly.
+
+ No other methods may be called on the connection after this has been
+ called. (It is permissible to call close() multiple times, but
+ only the first call will have any effect.)
+
+ If an exception occurs, the database will still be closed. Since this
+ is a read-only connection, no changes can be lost.
+
+ """
+ if self._index is None:
+ return
+ # There is currently no "close()" method for xapian databases, so
+ # we have to rely on the garbage collector. Since we never copy
+ # the _index property out of this class, there should be no cycles,
+ # so the standard python implementation should garbage collect
+ # _index straight away. A close() method is planned to be added to
+ # xapian at some point - when it is, we should call it here to make
+ # the code more robust.
+ self._index = None
+ self._indexpath = None
+ self._field_actions = None
+ self._field_mappings = None
+
+ def get_doccount(self):
+ """Count the number of documents in the database.
+
+ The count reflects the revision of the database which this connection
+ is currently using: changes flushed after the connection was opened
+ (or last reopened) are not included.
+
+ """
+ if self._index is None:
+ raise _errors.SearchError("SearchConnection has been closed")
+ return self._index.get_doccount()
+
+ def get_document(self, id):
+ """Get the document with the specified unique ID.
+
+ Raises a KeyError if there is no such document. Otherwise, it returns
+ a ProcessedDocument.
+
+ """
+ if self._index is None:
+ raise _errors.SearchError("SearchConnection has been closed")
+ postlist = self._index.postlist('Q' + id)
+ try:
+ plitem = postlist.next()
+ except StopIteration:
+ # Unique ID not found
+ raise KeyError('Unique ID %r not found' % id)
+ try:
+ postlist.next()
+ raise _errors.SearchError("Multiple documents " #pragma: no cover
+ "found with same unique ID")
+ except StopIteration:
+ # Only one instance of the unique ID found, as it should be.
+ pass
+
+ result = ProcessedDocument(self._field_mappings)
+ result.id = id
+ result._doc = self._index.get_document(plitem.docid)
+ return result
+
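+ # An illustrative sketch; the unique ID value is an assumption:
+ #
+ # try:
+ #     doc = conn.get_document('0001')
+ # except KeyError:
+ #     doc = None    # no document with that unique ID
+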
+ OP_AND = _xapian.Query.OP_AND
+ OP_OR = _xapian.Query.OP_OR
+ def query_composite(self, operator, queries):
+ """Build a composite query from a list of queries.
+
+ The queries are combined with the supplied operator, which is either
+ SearchConnection.OP_AND or SearchConnection.OP_OR.
+
+ """
+ if self._index is None:
+ raise _errors.SearchError("SearchConnection has been closed")
+ return _xapian.Query(operator, list(queries))
+
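+ # An illustrative sketch of combining two field queries; the field
+ # names are assumptions:
+ #
+ # q1 = conn.query_field('author', 'smith')
+ # q2 = conn.query_field('text', 'xapian')
+ # q = conn.query_composite(conn.OP_AND, [q1, q2])
+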
+ def query_filter(self, query, filter):
+ """Filter a query with another query.
+
+ Documents will only match the resulting query if they match both
+ queries, but will be weighted according to only the first query.
+
+ - `query`: The query to filter.
+ - `filter`: The filter to apply to the query.
+
+ """
+ if self._index is None:
+ raise _errors.SearchError("SearchConnection has been closed")
+ if not isinstance(filter, _xapian.Query):
+ raise _errors.SearchError("Filter must be a Xapian Query object")
+ return _xapian.Query(_xapian.Query.OP_FILTER, query, filter)
+
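+ # An illustrative sketch: restrict a weighted freetext query to a
+ # category without the filter affecting the weights (field names are
+ # assumptions):
+ #
+ # q = conn.query_filter(conn.query_parse('banana'),
+ #                       conn.query_field('category', 'fruit'))
+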
+ def query_range(self, field, begin, end):
+ """Create a query for a range search.
+
+ This creates a query which matches only those documents which have a
+ field value in the specified range.
+
+ Begin and end must be appropriate values for the field, according to
+ the 'type' parameter supplied to the SORTABLE action for the field.
+
+ The begin and end values are both inclusive - any documents with a
+ value equal to begin or end will be returned (unless end is less than
+ begin, in which case no documents will be returned).
+
+ """
+ if self._index is None:
+ raise _errors.SearchError("SearchConnection has been closed")
+
+ sorttype = self._get_sort_type(field)
+ marshaller = SortableMarshaller(False)
+ fn = marshaller.get_marshall_function(field, sorttype)
+ begin = fn(field, begin)
+ end = fn(field, end)
+
+ slot = self._field_mappings.get_slot(field)
+ return _xapian.Query(_xapian.Query.OP_VALUE_RANGE, slot, begin, end)
+
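+ # An illustrative sketch, assuming a field named 'date' was indexed
+ # as sortable with the 'date' type:
+ #
+ # q = conn.query_range('date', '2007-01-01', '2007-06-30')
+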
+ def _prepare_queryparser(self, allow, deny, default_op):
+ """Prepare (and return) a query parser using the specified fields and
+ operator.
+
+ """
+ if self._index is None:
+ raise _errors.SearchError("SearchConnection has been closed")
+ if allow is not None and deny is not None:
+ raise _errors.SearchError("Cannot specify both `allow` and `deny`")
+ qp = _xapian.QueryParser()
+ qp.set_database(self._index)
+ qp.set_default_op(default_op)
+
+ if allow is None:
+ allow = [key for key in self._field_actions]
+ if deny is not None:
+ allow = [key for key in allow if key not in deny]
+
+ for field in allow:
+ actions = self._field_actions[field]._actions
+ for action, kwargslist in actions.iteritems():
+ if action == FieldActions.INDEX_EXACT:
+ # FIXME - need patched version of xapian to add exact prefixes
+ #qp.add_exact_prefix(field, self._field_mappings.get_prefix(field))
+ qp.add_prefix(field, self._field_mappings.get_prefix(field))
+ if action == FieldActions.INDEX_FREETEXT:
+ qp.add_prefix(field, self._field_mappings.get_prefix(field))
+ for kwargs in kwargslist:
+ try:
+ lang = kwargs['language']
+ qp.set_stemmer(_xapian.Stem(lang))
+ qp.set_stemming_strategy(qp.STEM_SOME)
+ except KeyError:
+ pass
+ return qp
+
+ def query_parse(self, string, allow=None, deny=None, default_op=OP_AND):
+ """Parse a query string.
+
+ This is intended for parsing queries entered by a user. If you wish to
+ combine structured queries, it is generally better to use the other
+ query building methods, such as `query_composite`.
+
+ - `string`: The string to parse.
+ - `allow`: A list of fields to allow in the query.
+ - `deny`: A list of fields not to allow in the query.
+ - `default_op`: The default operator used to combine query terms.
+
+ Only one of `allow` and `deny` may be specified.
+
+ If any of the entries in `allow` or `deny` are not present in the
+ configuration for the database, an exception will be raised.
+
+ Returns a Query object, which may be passed to the search() method, or
+ combined with other queries.
+
+ """
+ qp = self._prepare_queryparser(allow, deny, default_op)
+ try:
+ return qp.parse_query(string)
+ except _xapian.QueryParserError, e:
+ # If we got a parse error, retry without boolean operators (since
+ # these are the usual cause of the parse error).
+ return qp.parse_query(string, 0)
+
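+ # An illustrative sketch of parsing a user-entered query; the 'text'
+ # field name is an assumption:
+ #
+ # q = conn.query_parse('olpc AND datastore', allow=['text'])
+ # results = conn.search(q, 0, 10)
+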
+ def query_field(self, field, value, default_op=OP_AND):
+ """A query for a single field.
+
+ """
+ if self._index is None:
+ raise _errors.SearchError("SearchConnection has been closed")
+ try:
+ actions = self._field_actions[field]._actions
+ except KeyError:
+ actions = {}
+
+ # need to check on field type, and stem / split as appropriate
+ for action, kwargslist in actions.iteritems():
+ if action == FieldActions.INDEX_EXACT:
+ prefix = self._field_mappings.get_prefix(field)
+ if len(value) > 0:
+ chval = ord(value[0])
+ if chval >= ord('A') and chval <= ord('Z'):
+ # Follow the Xapian convention of separating the prefix
+ # from terms which begin with a capital letter using a
+ # colon, so that the prefix boundary stays unambiguous.
+ prefix = prefix + ':'
+ return _xapian.Query(prefix + value)
+ if action == FieldActions.INDEX_FREETEXT:
+ qp = _xapian.QueryParser()
+ qp.set_default_op(default_op)
+ prefix = self._field_mappings.get_prefix(field)
+ for kwargs in kwargslist:
+ try:
+ lang = kwargs['language']
+ qp.set_stemmer(_xapian.Stem(lang))
+ qp.set_stemming_strategy(qp.STEM_SOME)
+ except KeyError:
+ pass
+ return qp.parse_query(value,
+ qp.FLAG_PHRASE | qp.FLAG_BOOLEAN | qp.FLAG_LOVEHATE,
+ prefix)
+
+ return _xapian.Query()
+
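+ # An illustrative sketch; field names and values are assumptions:
+ #
+ # q_exact = conn.query_field('mimetype', 'text/plain')
+ # q_free = conn.query_field('text', 'hello world', default_op=conn.OP_OR)
+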
+ def query_all(self):
+ """A query which matches all the documents in the database.
+
+ """
+ return _xapian.Query('')
+
+ def spell_correct(self, string, allow=None, deny=None):
+ """Correct a query spelling.
+
+ This returns a version of the query string with any misspelt words
+ corrected.
+
+ - `allow`: A list of fields to allow in the query.
+ - `deny`: A list of fields not to allow in the query.
+
+ Only one of `allow` and `deny` may be specified.
+
+ If any of the entries in `allow` or `deny` are not present in the
+ configuration for the database, an exception will be raised.
+
+ """
+ qp = self._prepare_queryparser(allow, deny, self.OP_AND)
+ qp.parse_query(string, qp.FLAG_PHRASE|qp.FLAG_BOOLEAN|qp.FLAG_LOVEHATE|qp.FLAG_SPELLING_CORRECTION)
+ corrected = qp.get_corrected_query_string()
+ if len(corrected) == 0:
+ if isinstance(string, unicode):
+ # Encode as UTF-8 for consistency - this happens automatically
+ # to values passed to Xapian.
+ return string.encode('utf-8')
+ return string
+ return corrected
+
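+ # An illustrative sketch of a "did you mean" suggestion; `user_query`
+ # is an assumption:
+ #
+ # corrected = conn.spell_correct(user_query)
+ # if corrected != user_query:
+ #     print 'Did you mean %r?' % corrected
+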
+ def search(self, query, startrank, endrank,
+ checkatleast=0, sortby=None, collapse=None):
+ """Perform a search, for documents matching a query.
+
+ - `query` is the query to perform.
+ - `startrank` is the rank of the start of the range of matching
+ documents to return (ie, the result with this rank will be returned).
+ Ranks start at 0, which represents the "best" matching document.
+ - `endrank` is the rank at the end of the range of matching documents
+ to return. This is exclusive, so the result with this rank will not
+ be returned.
+ - `checkatleast` is the minimum number of results to check for: the
+ estimate of the total number of matches will always be exact if
+ the number of matches is less than `checkatleast`.
+ - `sortby` is the name of a field to sort by. It may be preceded by a
+ '+' or a '-' to indicate ascending or descending order
+ (respectively). If the first character is neither '+' nor '-', the
+ sort will be in ascending order.
+ - `collapse` is the name of a field to collapse the result documents
+ on. If this is specified, there will be at most one result in the
+ result set for each value of the field.
+
+ """
+ if self._index is None:
+ raise _errors.SearchError("SearchConnection has been closed")
+ enq = _xapian.Enquire(self._index)
+ enq.set_query(query)
+
+ if sortby is not None:
+ asc = True
+ if sortby[0] == '-':
+ asc = False
+ sortby = sortby[1:]
+ elif sortby[0] == '+':
+ sortby = sortby[1:]
+
+ try:
+ slotnum = self._field_mappings.get_slot(sortby)
+ except KeyError:
+ raise _errors.SearchError("Field %r was not indexed for sorting" % sortby)
+
+ # Note: we invert the "asc" parameter, because xapian treats
+ # "ascending" as meaning "higher values are better"; in other
+ # words, it considers "ascending" to mean return results in
+ # descending order.
+ enq.set_sort_by_value_then_relevance(slotnum, not asc)
+
+ if collapse is not None:
+ try:
+ slotnum = self._field_mappings.get_slot(collapse)
+ except KeyError:
+ raise _errors.SearchError("Field %r was not indexed for collapsing" % collapse)
+ enq.set_collapse_key(slotnum)
+
+ maxitems = max(endrank - startrank, 0)
+ # Always check for at least one more result, so we can report whether
+ # there are more matches.
+ checkatleast = max(checkatleast, endrank + 1)
+
+ enq.set_docid_order(enq.DONT_CARE)
+
+ # Repeat the search until we don't get a DatabaseModifiedError
+ while True:
+ try:
+ mset = enq.get_mset(startrank, maxitems, checkatleast)
+ break
+ except _xapian.DatabaseModifiedError:
+ self.reopen()
+ return SearchResults(self, enq, query, mset, self._field_mappings)
+
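+# An illustrative end-to-end sketch; the index path and field names are
+# assumptions, not defined by this module:
+#
+# conn = SearchConnection('/path/to/index')
+# query = conn.query_parse('hello world', allow=['text'])
+# results = conn.search(query, 0, 10, sortby='-date', collapse='thread')
+# for result in results:
+#     print result.rank, result.id
+# conn.close()
+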
+if __name__ == '__main__':
+ import doctest, sys
+ doctest.testmod(sys.modules[__name__])