diff options
author | Marco Pesenti Gritti <marco@localhost.localdomain> | 2007-07-11 19:37:48 (GMT) |
---|---|---|
committer | Marco Pesenti Gritti <marco@localhost.localdomain> | 2007-07-11 19:37:48 (GMT) |
commit | 3a3a2c361fbf670ee5375e669d34be386f6924f8 (patch) | |
tree | b436ec29c47fdb983e8355c31768a1e3d2b10a6c | |
parent | cb8a3f7e34b07a4d3fb3ebb3cb7eddceaec0e73d (diff) |
Add secore. Cut and paste from http://flaxcode.googlecode.com/svn/trunk/libs/secore/secore/.
-rw-r--r-- | Makefile.am | 2 | ||||
-rw-r--r-- | configure.ac | 1 | ||||
-rw-r--r-- | secore/Makefile.am | 12 | ||||
-rw-r--r-- | secore/__init__.py | 30 | ||||
-rw-r--r-- | secore/datastructures.py | 216 | ||||
-rw-r--r-- | secore/errors.py | 40 | ||||
-rw-r--r-- | secore/fieldactions.py | 358 | ||||
-rw-r--r-- | secore/fieldmappings.py | 123 | ||||
-rw-r--r-- | secore/highlight.py | 310 | ||||
-rw-r--r-- | secore/indexerconnection.py | 380 | ||||
-rw-r--r-- | secore/marshall.py | 73 | ||||
-rw-r--r-- | secore/parsedate.py | 56 | ||||
-rw-r--r-- | secore/searchconnection.py | 618 |
13 files changed, 2218 insertions, 1 deletions
diff --git a/Makefile.am b/Makefile.am index 8060aae..abf71cf 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,4 +1,4 @@ -SUBDIRS = bin etc src +SUBDIRS = bin etc secore src test: @cd tests diff --git a/configure.ac b/configure.ac index 4824635..c60229a 100644 --- a/configure.ac +++ b/configure.ac @@ -12,6 +12,7 @@ AC_OUTPUT([ Makefile bin/Makefile etc/Makefile +secore/Makefile src/Makefile src/olpc/Makefile src/olpc/datastore/Makefile diff --git a/secore/Makefile.am b/secore/Makefile.am new file mode 100644 index 0000000..393ba8f --- /dev/null +++ b/secore/Makefile.am @@ -0,0 +1,12 @@ +datastoredir = $(pythondir)/secore +datastore_PYTHON = \ + __init__.py \ + datastructures.py \ + fieldmappings.py \ + searchconnection.py \ + errors.py \ + highlight.py \ + marshall.py \ + fieldactions.py \ + indexerconnection.py \ + parsedate.py diff --git a/secore/__init__.py b/secore/__init__.py new file mode 100644 index 0000000..157fea4 --- /dev/null +++ b/secore/__init__.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python +# +# Copyright (C) 2007 Lemur Consulting Ltd +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +"""Search engine Core. + +See the accompanying documentation for details. 
In particular, there should be +an accompanying file "introduction.html" (or "introduction.rst") which gives +details of how to use the secore package. + +""" +__docformat__ = "restructuredtext en" + +from datastructures import * +from errors import * +from indexerconnection import * +from searchconnection import * diff --git a/secore/datastructures.py b/secore/datastructures.py new file mode 100644 index 0000000..414625d --- /dev/null +++ b/secore/datastructures.py @@ -0,0 +1,216 @@ +#!/usr/bin/env python +# +# Copyright (C) 2007 Lemur Consulting Ltd +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +r"""datastructures.py: Datastructures for search engine core. + +""" +__docformat__ = "restructuredtext en" + +import xapian as _xapian +import cPickle as _cPickle + +class Field(object): + # Use __slots__ because we're going to have very many Field objects in + # typical usage. + __slots__ = 'name', 'value' + + def __init__(self, name, value): + self.name = name + self.value = value + + def __repr__(self): + return 'Field(%r, %r)' % (self.name, self.value) + +class UnprocessedDocument(object): + """A unprocessed document to be passed to the indexer. + + This represents an item to be processed and stored in the search engine. 
+ Each document will be processed by the indexer to generate a + ProcessedDocument, which can then be stored in the search engine index. + + Note that some information in an UnprocessedDocument will not be + represented in the ProcessedDocument: therefore, it is not possible to + retrieve an UnprocessedDocument from the search engine index. + + An unprocessed document is a simple container with two attributes: + + - `fields` is a list of Field objects. + - `id` is a string holding a unique identifier for the document (or + None to get the database to allocate a unique identifier automatically + when the document is added). + + """ + + __slots__ = 'id', 'fields', + def __init__(self, id=None, fields=None): + self.id = id + if fields is None: + self.fields = [] + else: + self.fields = fields + + def __repr__(self): + return 'UnprocessedDocument(%r, %r)' % (self.id, self.fields) + +class ProcessedDocument(object): + """A processed document, as stored in the index. + + This represents an item which is ready to be stored in the search engine, + or which has been returned by the search engine. + + """ + + __slots__ = '_doc', '_fieldmappings', '_data', + def __init__(self, fieldmappings, xapdoc=None): + """Create a ProcessedDocument. + + `fieldmappings` is the configuration from a database connection used lookup + the configuration to use to store each field. + + If supplied, `xapdoc` is a Xapian document to store in the processed + document. Otherwise, a new Xapian document is created. + + """ + if xapdoc is None: + self._doc = _xapian.Document() + else: + self._doc = xapdoc + self._fieldmappings = fieldmappings + self._data = None + + def add_term(self, field, term, wdfinc=1, positions=None): + """Add a term to the document. + + Terms are the main unit of information used for performing searches. + + - `field` is the field to add the term to. + - `term` is the term to add. + - `wdfinc` is the value to increase the within-document-frequency + measure for the term by. 
+ - `positions` is the positional information to add for the term. + This may be None to indicate that there is no positional information, + or may be an integer to specify one position, or may be a sequence of + integers to specify several positions. (Note that the wdf is not + increased automatically for each position: if you add a term at 7 + positions, and the wdfinc value is 2, the total wdf for the term will + only be increased by 2, not by 14.) + + """ + prefix = self._fieldmappings.get_prefix(field) + if len(term) > 0: + # We use the following check, rather than "isupper()" to ensure + # that we match the check performed by the queryparser, regardless + # of our locale. + if ord(term[0]) >= ord('A') and ord(term[0]) <= ord('Z'): + prefix = prefix + ':' + if positions is None: + self._doc.add_term(prefix + term, wdfinc) + elif isinstance(positions, int): + self._doc.add_posting(prefix + term, positions, wdfinc) + else: + self._doc.add_term(prefix + term, wdfinc) + for pos in positions: + self._doc.add_posting(prefix + term, pos, 0) + + def add_value(self, field, value): + """Add a value to the document. + + Values are additional units of information used when performing + searches. Note that values are _not_ intended to be used to store + information for display in the search results - use the document data + for that. The intention is that as little information as possible is + stored in values, so that they can be accessed as quickly as possible + during the search operation. + + Unlike terms, each document may have at most one value in each field + (whereas there may be an arbitrary number of terms in a given field). + If an attempt to add multiple values to a single field is made, only + the last value added will be stored. + + """ + slot = self._fieldmappings.get_slot(field) + self._doc.add_value(slot, value) + + def get_value(self, field): + """Get a value from the document. 
+ + """ + slot = self._fieldmappings.get_slot(field) + return self._doc.get_value(slot) + + def prepare(self): + """Prepare the document for adding to a xapian database. + + This updates the internal xapian document with any changes which have + been made, and then returns it. + + """ + if self._data is not None: + self._doc.set_data(_cPickle.dumps(self._data, 2)) + self._data = None + return self._doc + + def _get_data(self): + if self._data is None: + rawdata = self._doc.get_data() + if rawdata == '': + self._data = {} + else: + self._data = _cPickle.loads(rawdata) + return self._data + def _set_data(self, data): + if not isinstance(data, dict): + raise TypeError("Cannot set data to any type other than a dict") + self._data = data + data = property(_get_data, _set_data, doc= + """The data stored in this processed document. + + This data is a dictionary of entries, where the key is a fieldname, and the + value is a list of strings. + + """) + + def _get_id(self): + tl = self._doc.termlist() + try: + term = tl.skip_to('Q').term + if len(term) == 0 or term[0] != 'Q': + return None + except StopIteration: + return None + return term[1:] + def _set_id(self, id): + tl = self._doc.termlist() + try: + term = tl.skip_to('Q').term + except StopIteration: + term = '' + if len(term) != 0 and term[0] == 'Q': + self._doc.remove_term(term) + if id is not None: + self._doc.add_term('Q' + id, 0) + id = property(_get_id, _set_id, doc= + """The unique ID for this document. 
+ + """) + + def __repr__(self): + return '<ProcessedDocument(%r)>' % (self.id) + +if __name__ == '__main__': + import doctest, sys + doctest.testmod (sys.modules[__name__]) diff --git a/secore/errors.py b/secore/errors.py new file mode 100644 index 0000000..b6ad00f --- /dev/null +++ b/secore/errors.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python +# +# Copyright (C) 2007 Lemur Consulting Ltd +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +r"""errors.py: Exceptions for the search engine core. + +""" +__docformat__ = "restructuredtext en" + +class SearchEngineError(Exception): + r"""Base class for exceptions thrown by the search engine. + + Any errors generated by the python level interface to xapian will be + instances of this class or its subclasses. + + """ + +class IndexerError(SearchEngineError): + r"""Class used to report errors from the indexing API. + + """ + +class SearchError(SearchEngineError): + r"""Class used to report errors from the search API. 
+ + """ + diff --git a/secore/fieldactions.py b/secore/fieldactions.py new file mode 100644 index 0000000..c595f0b --- /dev/null +++ b/secore/fieldactions.py @@ -0,0 +1,358 @@ +#!/usr/bin/env python +# +# Copyright (C) 2007 Lemur Consulting Ltd +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +r"""fieldactions.py: Definitions and implementations of field actions. + +""" +__docformat__ = "restructuredtext en" + +import errors as _errors +import marshall as _marshall +import xapian as _xapian +import parsedate as _parsedate + +def _act_store_content(fieldname, doc, value, context): + """Perform the STORE_CONTENT action. + + """ + try: + fielddata = doc.data[fieldname] + except KeyError: + fielddata = [] + doc.data[fieldname] = fielddata + fielddata.append(value) + +def _act_index_exact(fieldname, doc, value, context): + """Perform the INDEX_EXACT action. + + """ + doc.add_term(fieldname, value, 0) + +def _act_index_freetext(fieldname, doc, value, context, weight=1, + language=None, stop=None, spell=False, + nopos=False, noprefix=False): + """Perform the INDEX_FREETEXT action. 
+ + """ + termgen = _xapian.TermGenerator() + if language is not None: + termgen.set_stemmer(_xapian.Stem(language)) + + if stop is not None: + stopper = _xapian.SimpleStopper() + for term in stop: + stopper.add (term) + termgen.set_stopper (stopper) + + if spell: + termgen.set_database(context.index) + termgen.set_flags(termgen.FLAG_SPELLING) + + termgen.set_document(doc._doc) + termgen.set_termpos(context.current_position) + if nopos: + termgen.index_text_without_positions(value, weight, '') + else: + termgen.index_text(value, weight, '') + + if not noprefix: + # Store a second copy of the term with a prefix, for field-specific + # searches. + prefix = doc._fieldmappings.get_prefix(fieldname) + if len(prefix) != 0: + termgen.set_termpos(context.current_position) + if nopos: + termgen.index_text_without_positions(value, weight, prefix) + else: + termgen.index_text(value, weight, prefix) + + # Add a gap between each field instance, so that phrase searches don't + # match across instances. + termgen.increase_termpos(10) + context.current_position = termgen.get_termpos() + +class SortableMarshaller(object): + """Implementation of marshalling for sortable values. + + """ + def __init__(self, indexing=True): + if indexing: + self._err = _errors.IndexerError + else: + self._err = _errors.SearchError + + def marshall_string(self, fieldname, value): + """Marshall a value for sorting in lexicograpical order. + + This returns the input as the output, since strings already sort in + lexicographical order. + + """ + return value + + def marshall_float(self, fieldname, value): + """Marshall a value for sorting as a floating point value. 
+ + """ + # convert the value to a float + try: + value = float(value) + except ValueError: + raise self._err("Value supplied to field %r must be a " + "valid floating point number: was %r" % + (fieldname, value)) + return _marshall.float_to_string(value) + + def marshall_date(self, fieldname, value): + """Marshall a value for sorting as a date. + + """ + try: + value = _parsedate.date_from_string(value) + except ValueError, e: + raise self._err("Value supplied to field %r must be a " + "valid date: was %r: error is '%s'" % + (fieldname, value, str(e))) + return _marshall.date_to_string(value) + + def get_marshall_function(self, fieldname, sorttype): + """Get a function used to marshall values of a given sorttype. + + """ + try: + return { + None: self.marshall_string, + 'string': self.marshall_string, + 'float': self.marshall_float, + 'date': self.marshall_date, + }[sorttype] + except KeyError: + raise self._err("Unknown sort type %r for field %r" % + (sorttype, fieldname)) + + +def _act_sort_and_collapse(fieldname, doc, value, context, type=None): + """Perform the SORTABLE action. + + """ + marshaller = SortableMarshaller() + fn = marshaller.get_marshall_function(fieldname, type) + value = fn(fieldname, value) + doc.add_value(fieldname, value) + +class ActionContext(object): + """The context in which an action is performed. + + This is just used to pass term generators, word positions, and the like + around. + + """ + def __init__(self, index): + self.current_language = None + self.current_position = 0 + self.index = index + +class FieldActions(object): + """An object describing the actions to be performed on a field. + + The supported actions are: + + - `STORE_CONTENT`: store the unprocessed content of the field in the search + engine database. All fields which need to be displayed or used when + displaying the search results need to be given this action. + + - `INDEX_EXACT`: index the exact content of the field as a single search + term. 
Fields whose contents need to be searchable as an "exact match" + need to be given this action. + + - `INDEX_FREETEXT`: index the content of this field as text. The content + will be split into terms, allowing free text searching of the field. Four + optional parameters may be supplied: + + - 'weight' is a multiplier to apply to the importance of the field. This + must be an integer, and the default value is 1. + - 'language' is the language to use when processing the field. This can + be expressed as an ISO 2-letter language code. The supported languages + are those supported by the xapian core in use. + - 'stop' is an iterable of stopwords to filter out of the generated + terms. Note that due to Xapian design, only non-positional terms are + affected, so this is of limited use. + - 'spell' is a boolean flag - if true, the contents of the field will be + used for spelling correction. + - 'nopos' is a boolean flag - if true, positional information is not + stored. + - 'noprefix' is a boolean flag - if true, prevents terms with the field + prefix being generated. This means that searches specific to this + field will not work, and thus should only be used for special cases. + + - `SORTABLE`: index the content of the field such that it can be used to + sort result sets. It also allows result sets to be restricted to those + documents with a field values in a given range. One optional parameter + may be supplied: + + - 'type' is a value indicating how to sort the field. It has several + possible values: + + - 'string' - sort in lexicographic (ie, alphabetical) order. + This is the default, used if no type is set. + - 'float' - treat the values as (decimal representations of) floating + point numbers, and sort in numerical order . The values in the field + must be valid floating point numbers (according to Python's float() + function). + - 'date' - sort in date order. 
The values must be valid dates (either + Python datetime.date objects, or ISO 8601 format (ie, YYYYMMDD or + YYYY-MM-DD). + + - `COLLAPSE`: index the content of the field such that it can be used to + "collapse" result sets, such that only the highest result with each value + of the field will be returned. + + """ + + # See the class docstring for the meanings of the following constants. + STORE_CONTENT = 1 + INDEX_EXACT = 2 + INDEX_FREETEXT = 3 + SORTABLE = 4 + COLLAPSE = 5 + + # Sorting and collapsing store the data in a value, but the format depends + # on the sort type. Easiest way to implement is to treat them as the same + # action. + SORT_AND_COLLAPSE = -1 + + # NEED_SLOT is a flag used to indicate that an action needs a slot number + NEED_SLOT = 1 + # NEED_PREFIX is a flag used to indicate that an action needs a prefix + NEED_PREFIX = 2 + + def __init__(self, fieldname): + # Dictionary of actions, keyed by type. + self._actions = {} + self._fieldname = fieldname + + def add(self, field_mappings, action, **kwargs): + """Add an action to perform on a field. + + """ + if action not in (FieldActions.STORE_CONTENT, + FieldActions.INDEX_EXACT, + FieldActions.INDEX_FREETEXT, + FieldActions.SORTABLE, + FieldActions.COLLAPSE,): + raise _errors.IndexerError("Unknown field action: %r" % action) + + info = self._action_info[action] + + # Check parameter names + for key in kwargs.keys(): + if key not in info[1]: + raise _errors.IndexerError("Unknown parameter name for action %r: %r" % (info[0], key)) + + # Fields cannot be indexed both with "EXACT" and "FREETEXT": whilst we + # could implement this, the query parser wouldn't know what to do with + # searches. 
+ if action == FieldActions.INDEX_EXACT: + if FieldActions.INDEX_FREETEXT in self._actions: + raise _errors.IndexerError("Field %r is already marked for indexing " + "as free text: cannot mark for indexing " + "as exact text as well" % self._fieldname) + if action == FieldActions.INDEX_FREETEXT: + if FieldActions.INDEX_EXACT in self._actions: + raise _errors.IndexerError("Field %r is already marked for indexing " + "as exact text: cannot mark for indexing " + "as free text as well" % self._fieldname) + + # Fields cannot be indexed as more than one type for "SORTABLE": to + # implement this, we'd need to use a different prefix for each sortable + # type, but even then the search end wouldn't know what to sort on when + # searching. Also, if they're indexed as "COLLAPSE", the value must be + # stored in the right format for the type "SORTABLE". + if action == FieldActions.SORTABLE or action == FieldActions.COLLAPSE: + if action == FieldActions.COLLAPSE: + sorttype = None + else: + try: + sorttype = kwargs['type'] + except KeyError: + sorttype = 'string' + kwargs['type'] = sorttype + action = FieldActions.SORT_AND_COLLAPSE + + try: + oldsortactions = self._actions[FieldActions.SORT_AND_COLLAPSE] + except KeyError: + oldsortactions = () + + if len(oldsortactions) > 0: + for oldsortaction in oldsortactions: + oldsorttype = oldsortaction['type'] + + if sorttype == oldsorttype or oldsorttype is None: + # Use new type + self._actions[action] = [] + elif sorttype is None: + # Use old type + return + else: + raise _errors.IndexerError("Field %r is already marked for " + "sorting, with a different " + "sort type" % self._fieldname) + + if self.NEED_PREFIX in info[3]: + field_mappings.add_prefix(self._fieldname) + if self.NEED_SLOT in info[3]: + field_mappings.add_slot(self._fieldname) + + # Make an entry for the action + if action not in self._actions: + self._actions[action] = [] + + # Check for repetitions of actions + for old_action in self._actions[action]: + if 
old_action == kwargs: + return + + # Append the action to the list of actions + self._actions[action].append(kwargs) + + def perform(self, doc, value, context): + """Perform the actions on the field. + + - `doc` is a ProcessedDocument to store the result of the actions in. + - `value` is a string holding the value of the field. + - `context` is an ActionContext object used to keep state in. + + """ + for type, actionlist in self._actions.iteritems(): + info = self._action_info[type] + for kwargs in actionlist: + info[2](self._fieldname, doc, value, context, **kwargs) + + _action_info = { + STORE_CONTENT: ('STORE_CONTENT', (), _act_store_content, (), ), + INDEX_EXACT: ('INDEX_EXACT', (), _act_index_exact, (NEED_PREFIX,), ), + INDEX_FREETEXT: ('INDEX_FREETEXT', ('weight', 'language', 'stop', 'spell', 'nopos', 'noprefix', ), + _act_index_freetext, (NEED_PREFIX, ), ), + SORTABLE: ('SORTABLE', ('type', ), None, (NEED_SLOT,), ), + COLLAPSE: ('COLLAPSE', (), None, (NEED_SLOT,), ), + SORT_AND_COLLAPSE: ('SORT_AND_COLLAPSE', ('type', ), _act_sort_and_collapse, (NEED_SLOT,), ), + } + +if __name__ == '__main__': + import doctest, sys + doctest.testmod (sys.modules[__name__]) diff --git a/secore/fieldmappings.py b/secore/fieldmappings.py new file mode 100644 index 0000000..3838ce5 --- /dev/null +++ b/secore/fieldmappings.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python +# +# Copyright (C) 2007 Lemur Consulting Ltd +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +r"""fieldmappings.py: Mappings from field names to term prefixes, etc. + +""" +__docformat__ = "restructuredtext en" + +import cPickle as _cPickle + +class FieldMappings(object): + """Mappings from field names to term prefixes, slot values, etc. + + The following mappings are maintained: + + - a mapping from field name to the string prefix to insert at the start of + terms. + - a mapping from field name to the slot numbers to store the field contents + in. + + """ + __slots__ = '_prefixes', '_prefixcount', '_slots', '_slotcount', + + def __init__(self, serialised=None): + """Create a new field mapping object, or unserialise a saved one. + + """ + if serialised is not None: + (self._prefixes, self._prefixcount, + self._slots, self._slotcount) = _cPickle.loads(serialised) + else: + self._prefixes = {} + self._prefixcount = 0 + self._slots = {} + self._slotcount = 0 + + def _genPrefix(self): + """Generate a previously unused prefix. 
+ + Prefixes are uppercase letters, and start with 'X' (this is a Xapian + convention, for compatibility with other Xapian tools: other starting + letters are reserved for special meanings): + + >>> maps = FieldMappings() + >>> maps._genPrefix() + 'XA' + >>> maps._genPrefix() + 'XB' + >>> [maps._genPrefix() for i in xrange(60)] + ['XC', 'XD', 'XE', 'XF', 'XG', 'XH', 'XI', 'XJ', 'XK', 'XL', 'XM', 'XN', 'XO', 'XP', 'XQ', 'XR', 'XS', 'XT', 'XU', 'XV', 'XW', 'XX', 'XY', 'XZ', 'XAA', 'XBA', 'XCA', 'XDA', 'XEA', 'XFA', 'XGA', 'XHA', 'XIA', 'XJA', 'XKA', 'XLA', 'XMA', 'XNA', 'XOA', 'XPA', 'XQA', 'XRA', 'XSA', 'XTA', 'XUA', 'XVA', 'XWA', 'XXA', 'XYA', 'XZA', 'XAB', 'XBB', 'XCB', 'XDB', 'XEB', 'XFB', 'XGB', 'XHB', 'XIB', 'XJB'] + >>> maps = FieldMappings() + >>> [maps._genPrefix() for i in xrange(27*26 + 5)][-10:] + ['XVZ', 'XWZ', 'XXZ', 'XYZ', 'XZZ', 'XAAA', 'XBAA', 'XCAA', 'XDAA', 'XEAA'] + """ + res = [] + self._prefixcount += 1 + num = self._prefixcount + while num != 0: + ch = (num - 1) % 26 + res.append(chr(ch + ord('A'))) + num -= ch + num = num // 26 + return 'X' + ''.join(res) + + def get_prefix(self, fieldname): + """Get the prefix used for a given field name. + + """ + return self._prefixes[fieldname] + + def get_slot(self, fieldname): + """Get the slot number used for a given field name. + + """ + return self._slots[fieldname] + + def add_prefix(self, fieldname): + """Allocate a prefix for the given field. + + If a prefix is already allocated for this field, this has no effect. + + """ + if fieldname in self._prefixes: + return + self._prefixes[fieldname] = self._genPrefix() + + def add_slot(self, fieldname): + """Allocate a slot number for the given field. + + If a slot number is already allocated for this field, this has no effect. + + """ + if fieldname in self._slots: + return + self._slots[fieldname] = self._slotcount + self._slotcount += 1 + + def serialise(self): + """Serialise the field mappings to a string. 
+ + This can be unserialised by passing the result of this method to the + constructor of a new FieldMappings object. + + """ + return _cPickle.dumps((self._prefixes, + self._prefixcount, + self._slots, + self._slotcount, + ), 2) diff --git a/secore/highlight.py b/secore/highlight.py new file mode 100644 index 0000000..38f2050 --- /dev/null +++ b/secore/highlight.py @@ -0,0 +1,310 @@ +#!/usr/bin/env python +# +# Copyright (C) 2007 Lemur Consulting Ltd +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +r"""highlight.py: Highlight and summarise text. + +""" +__docformat__ = "restructuredtext en" + +import re +import xapian + +class Highlighter(object): + """Class for highlighting text and creating contextual summaries. + + >>> hl = Highlighter("en") + >>> hl.makeSample('Hello world.', ['world']) + 'Hello world.' + >>> hl.highlight('Hello world', ['world'], ('<', '>')) + 'Hello <world>' + + """ + + # split string into words, spaces, punctuation and markup tags + _split_re = re.compile( + '</\\w+>|<\\w+(?:\\s*\\w+="[^"]*"|\\s*\\w+)*\\s*>|[\\w\']+|\\s+|[^\\w\'\\s<>/]+') + + def __init__(self, language_code='en', stemmer=None): + """Create a new highlighter for the specified language. 
+ + """ + if stemmer is not None: + self.stem = stemmer + else: + self.stem = xapian.Stem(language_code) + + def _split_text(self, text, strip_tags=False): + """Split some text into words and non-words. + + - `text` is the text to process. It may be a unicode object or a utf-8 + encoded simple string. + - `strip_tags` is a flag - False to keep tags, True to strip all tags + from the output. + + Returns a list of utf-8 encoded simple strings. + + """ + if isinstance(text, unicode): + text = text.encode('utf-8') + + words = self._split_re.findall(text) + if strip_tags: + return [w for w in words if w[0] != '<'] + else: + return words + + def _strip_prefix(self, term): + """Strip the prefix off a term. + + Prefixes are any initial capital letters, with the exception that R always + ends a prefix, even if followed by capital letters. + + >>> hl = Highlighter("en") + >>> print hl._strip_prefix('hello') + hello + >>> print hl._strip_prefix('Rhello') + hello + >>> print hl._strip_prefix('XARHello') + Hello + >>> print hl._strip_prefix('XAhello') + hello + >>> print hl._strip_prefix('XAh') + h + >>> print hl._strip_prefix('XA') + <BLANKLINE> + + """ + for p in xrange(len(term)): + if term[p].islower(): + return term[p:] + elif term[p] == 'R': + return term[p+1:] + return '' + + def _query_to_stemmed_words(self, query): + """Convert a query to a list of stemmed words. + + - `query` is the query to parse: it may be xapian.Query object, or a + sequence of terms. + + """ + if isinstance(query, xapian.Query): + return [self._strip_prefix(t) for t in query] + else: + return [self.stem(q.lower()) for q in query] + + + def makeSample(self, text, query, maxlen=600, hl=None): + """Make a contextual summary from the supplied text. + + This basically works by splitting the text into phrases, counting the query + terms in each, and keeping those with the most. + + Any markup tags in the text will be stripped. + + `text` is the source text to summarise. 
+ `query` is either a Xapian query object or a list of (unstemmed) term strings. + `maxlen` is the maximum length of the generated summary. + `hl` is a pair of strings to insert around highlighted terms, e.g. ('<b>', '</b>') + + """ + + words = self._split_text(text, True) + terms = self._query_to_stemmed_words(query) + + # build blocks delimited by puncuation, and count matching words in each block + # blocks[n] is a block [firstword, endword, charcount, termcount, selected] + blocks = [] + start = end = count = blockchars = 0 + + while end < len(words): + blockchars += len(words[end]) + if words[end].isalnum(): + if self.stem(words[end].lower()) in terms: + count += 1 + end += 1 + elif words[end] in ',.;:?!\n': + end += 1 + blocks.append([start, end, blockchars, count, False]) + start = end + blockchars = 0 + count = 0 + else: + end += 1 + if start != end: + blocks.append([start, end, blockchars, count, False]) + if len(blocks) == 0: + return '' + + # select high-scoring blocks first, down to zero-scoring + chars = 0 + for count in xrange(3, -1, -1): + for b in blocks: + if b[3] >= count: + b[4] = True + chars += b[2] + if chars >= maxlen: break + if chars >= maxlen: break + + # assemble summary + words2 = [] + lastblock = -1 + for i, b in enumerate(blocks): + if b[4]: + if i != lastblock + 1: + words2.append('..') + words2.extend(words[b[0]:b[1]]) + lastblock = i + + if not blocks[-1][4]: + words2.append('..') + + # trim down to maxlen + l = 0 + for i in xrange (len (words2)): + l += len (words2[i]) + if l >= maxlen: + words2[i:] = ['..'] + break + + if hl is None: + return ''.join(words2) + else: + return self._hl(words2, terms, hl) + + def highlight(self, text, query, hl, strip_tags=False): + """Add highlights (string prefix/postfix) to a string. + + `text` is the source to highlight. + `query` is either a Xapian query object or a list of (unstemmed) term strings. + `hl` is a pair of highlight strings, e.g. 
('<i>', '</i>') + `strip_tags` strips HTML markout iff True + + >>> hl = Highlighter() + >>> qp = xapian.QueryParser() + >>> q = qp.parse_query('cat dog') + >>> tags = ('[[', ']]') + >>> hl.highlight('The cat went Dogging; but was <i>dog tired</i>.', q, tags) + 'The [[cat]] went [[Dogging]]; but was <i>[[dog]] tired</i>.' + + """ + words = self._split_text(text, strip_tags) + terms = self._query_to_stemmed_words(query) + return self._hl(words, terms, hl) + + def _hl(self, words, terms, hl): + """Add highlights to a list of words. + + `words` is the list of words and non-words to be highlighted.. + `terms` is the list of stemmed words to look for. + + """ + for i, w in enumerate(words): + if self.stem(words[i].lower()) in terms: + words[i] = ''.join((hl[0], w, hl[1])) + + return ''.join(words) + + +__test__ = { + 'no_punc': r''' + + Test the highlighter's behaviour when there is no punctuation in the sample + text (regression test - used to return no output): + >>> hl = Highlighter("en") + >>> hl.makeSample('Hello world', ['world']) + 'Hello world' + + ''', + + 'stem_levels': r''' + + Test highlighting of words, and how it works with stemming: + >>> hl = Highlighter("en") + + # "word" and "wording" stem to "word", so the following 4 calls all return + # the same thing + >>> hl.makeSample('Hello. word. wording. wordinging.', ['word'], hl='<>') + 'Hello. <word>. <wording>. wordinging.' + >>> hl.highlight('Hello. word. wording. wordinging.', ['word'], '<>') + 'Hello. <word>. <wording>. wordinging.' + >>> hl.makeSample('Hello. word. wording. wordinging.', ['wording'], hl='<>') + 'Hello. <word>. <wording>. wordinging.' + >>> hl.highlight('Hello. word. wording. wordinging.', ['wording'], '<>') + 'Hello. <word>. <wording>. wordinging.' + + # "wordinging" stems to "wording", so only the last word is highlighted for + # this one. + >>> hl.makeSample('Hello. word. wording. wordinging.', ['wordinging'], hl='<>') + 'Hello. word. wording. <wordinging>.' 
+ >>> hl.highlight('Hello. word. wording. wordinging.', ['wordinging'], '<>') + 'Hello. word. wording. <wordinging>.' + ''', + + 'supplied_stemmer': r''' + + Test behaviour if we pass in our own stemmer: + >>> stem = xapian.Stem('en') + >>> hl = Highlighter(stemmer=stem) + >>> hl.highlight('Hello. word. wording. wordinging.', ['word'], '<>') + 'Hello. <word>. <wording>. wordinging.' + + ''', + + 'unicode': r''' + + Test behaviour if we pass in unicode input: + >>> hl = Highlighter('en') + >>> hl.highlight(u'Hello\xf3. word. wording. wordinging.', ['word'], '<>') + 'Hello\xc3\xb3. <word>. <wording>. wordinging.' + + ''', + + 'no_sample': r''' + + Test behaviour if we pass in unicode input: + >>> hl = Highlighter('en') + >>> hl.makeSample(u'', ['word']) + '' + + ''', + + 'short_samples': r''' + + >>> hl = Highlighter('en') + >>> hl.makeSample("A boring start. Hello world indeed. A boring end.", ['hello'], 20, ('<', '>')) + '.. <Hello> world ..' + >>> hl.makeSample("A boring start. Hello world indeed. A boring end.", ['hello'], 40, ('<', '>')) + 'A boring start. <Hello> world indeed...' + >>> hl.makeSample("A boring start. Hello world indeed. A boring end.", ['boring'], 40, ('<', '>')) + 'A <boring> start... A <boring> end.' + + ''', + + 'apostrophes': r''' + + >>> hl = Highlighter('en') + >>> hl.makeSample("A boring start. Hello world's indeed. A boring end.", ['world'], 40, ('<', '>')) + "A boring start. Hello <world's> indeed..." 
+ + ''', + +} + +if __name__ == '__main__': + import doctest, sys + doctest.testmod (sys.modules[__name__]) diff --git a/secore/indexerconnection.py b/secore/indexerconnection.py new file mode 100644 index 0000000..be82319 --- /dev/null +++ b/secore/indexerconnection.py @@ -0,0 +1,380 @@ +#!/usr/bin/env python +# +# Copyright (C) 2007 Lemur Consulting Ltd +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +r"""indexerconnection.py: A connection to the search engine for indexing. + +""" +__docformat__ = "restructuredtext en" + +import xapian as _xapian +from datastructures import * +from fieldactions import * +import fieldmappings as _fieldmappings +import errors as _errors +import os as _os +import cPickle as _cPickle + +class IndexerConnection(object): + """A connection to the search engine for indexing. + + """ + + def __init__(self, indexpath): + """Create a new connection to the index. + + There may only be one indexer connection for a particular database open + at a given time. Therefore, if a connection to the database is already + open, this will raise a xapian.DatabaseLockError. + + If the database doesn't already exist, it will be created. + + """ + self._index = _xapian.WritableDatabase(indexpath, _xapian.DB_CREATE_OR_OPEN) + self._indexpath = indexpath + + # Read existing actions. 
+ self._field_actions = {}
+ self._field_mappings = _fieldmappings.FieldMappings()
+ self._next_docid = 0
+ self._config_modified = False
+ self._load_config()
+
+ def _store_config(self):
+ """Store the configuration for the database.
+
+ Currently, this stores the configuration in a file in the database
+ directory, so changes to it are not protected by transactions. When
+ support is available in xapian for storing metadata associated with
+ databases, this will be used instead of a file.
+
+ """
+ config_str = _cPickle.dumps((
+ self._field_actions,
+ self._field_mappings.serialise(),
+ self._next_docid,
+ ), 2)
+ config_file = _os.path.join(self._indexpath, 'config')
+ fd = open(config_file, "w")
+ fd.write(config_str)
+ fd.close()
+ self._config_modified = False
+
+ def _load_config(self):
+ """Load the configuration for the database.
+
+ """
+ config_file = _os.path.join(self._indexpath, 'config')
+ if not _os.path.exists(config_file):
+ return
+ fd = open(config_file)
+ config_str = fd.read()
+ fd.close()
+
+ (self._field_actions, mappings, self._next_docid) = _cPickle.loads(config_str)
+ self._field_mappings = _fieldmappings.FieldMappings(mappings)
+ self._config_modified = False
+
+ def _allocate_id(self):
+ """Allocate a new ID.
+
+ """
+ while True:
+ idstr = "%x" % self._next_docid
+ self._next_docid += 1
+ if not self._index.term_exists('Q' + idstr):
+ break
+ self._config_modified = True
+ return idstr
+
+ def add_field_action(self, fieldname, fieldtype, **kwargs):
+ """Add an action to be performed on a field.
+
+ Note that this change to the configuration will not be preserved on
+ disk until the next call to flush(). 
+ + """ + if self._index is None: + raise _errors.IndexerError("IndexerConnection has been closed") + if fieldname in self._field_actions: + actions = self._field_actions[fieldname] + else: + actions = FieldActions(fieldname) + self._field_actions[fieldname] = actions + actions.add(self._field_mappings, fieldtype, **kwargs) + self._config_modified = True + + def clear_field_actions(self, fieldname): + """Clear all actions for the specified field. + + This does not report an error if there are already no actions for the + specified field. + + Note that this change to the configuration will not be preserved on + disk until the next call to flush(). + + """ + if self._index is None: + raise _errors.IndexerError("IndexerConnection has been closed") + if fieldname in self._field_actions: + del self._field_actions[fieldname] + self._config_modified = True + + def process(self, document): + """Process an UnprocessedDocument with the settings in this database. + + The resulting ProcessedDocument is returned. + + Note that this processing will be automatically performed if an + UnprocessedDocument is supplied to the add() or replace() methods of + IndexerConnection. This method is exposed to allow the processing to + be performed separately, which may be desirable if you wish to manually + modify the processed document before adding it to the database, or if + you want to split processing of documents from adding documents to the + database for performance reasons. + + """ + if self._index is None: + raise _errors.IndexerError("IndexerConnection has been closed") + result = ProcessedDocument(self._field_mappings) + result.id = document.id + context = ActionContext(self._index) + + for field in document.fields: + try: + actions = self._field_actions[field.name] + except KeyError: + # If no actions are defined, just ignore the field. 
+ continue + actions.perform(result, field.value, context) + + return result + + def add(self, document): + """Add a new document to the search engine index. + + If the document has a id set, and the id already exists in + the database, an exception will be raised. Use the replace() method + instead if you wish to overwrite documents. + + Returns the id of the newly added document (making up a new + unique ID if no id was set). + + The supplied document may be an instance of UnprocessedDocument, or an + instance of ProcessedDocument. + + """ + if self._index is None: + raise _errors.IndexerError("IndexerConnection has been closed") + if not hasattr(document, '_doc'): + # It's not a processed document. + document = self.process(document) + + # Ensure that we have a id + orig_id = document.id + if orig_id is None: + id = self._allocate_id() + document.id = id + else: + id = orig_id + if self._index.term_exists('Q' + id): + raise _errors.IndexerError("Document ID of document supplied to add() is not unique.") + + # Add the document. + xapdoc = document.prepare() + self._index.add_document(xapdoc) + + if id is not orig_id: + document.id = orig_id + return id + + def replace(self, document): + """Replace a document in the search engine index. + + If the document does not have a id set, an exception will be + raised. + + If the document has a id set, and the id does not already + exist in the database, this method will have the same effect as add(). + + """ + if self._index is None: + raise _errors.IndexerError("IndexerConnection has been closed") + if not hasattr(document, '_doc'): + # It's not a processed document. + document = self.process(document) + + # Ensure that we have a id + id = document.id + if id is None: + raise _errors.IndexerError("No document ID set for document supplied to replace().") + + xapdoc = document.prepare() + self._index.replace_document('Q' + id, xapdoc) + + def delete(self, id): + """Delete a document from the search engine index. 
+ + If the id does not already exist in the database, this method + will have no effect (and will not report an error). + + """ + if self._index is None: + raise _errors.IndexerError("IndexerConnection has been closed") + self._index.delete_document('Q' + id) + + def flush(self): + """Apply recent changes to the database. + + If an exception occurs, any changes since the last call to flush() may + be lost. + + """ + if self._index is None: + raise _errors.IndexerError("IndexerConnection has been closed") + if self._config_modified: + self._store_config() + self._index.flush() + + def close(self): + """Close the connection to the database. + + It is important to call this method before allowing the class to be + garbage collected, because it will ensure that any un-flushed changes + will be flushed. It also ensures that the connection is cleaned up + promptly. + + No other methods may be called on the connection after this has been + called. (It is permissible to call close() multiple times, but + only the first call will have any effect.) + + If an exception occurs, the database will be closed, but changes since + the last call to flush may be lost. + + """ + if self._index is None: + return + try: + self.flush() + finally: + # There is currently no "close()" method for xapian databases, so + # we have to rely on the garbage collector. Since we never copy + # the _index property out of this class, there should be no cycles, + # so the standard python implementation should garbage collect + # _index straight away. A close() method is planned to be added to + # xapian at some point - when it is, we should call it here to make + # the code more robust. + self._index = None + self._indexpath = None + self._field_actions = None + self._config_modified = False + + def get_doccount(self): + """Count the number of documents in the database. + + This count will include documents which have been added or removed but + not yet flushed(). 
+
+ """
+ if self._index is None:
+ raise _errors.IndexerError("IndexerConnection has been closed")
+ return self._index.get_doccount()
+
+ def iterids(self):
+ """Get an iterator which returns all the ids in the database.
+
+ The unique_ids are currently returned in binary lexicographical sort
+ order, but this should not be relied on.
+
+ """
+ if self._index is None:
+ raise _errors.IndexerError("IndexerConnection has been closed")
+ return PrefixedTermIter('Q', self._index.allterms())
+
+ def get_document(self, id):
+ """Get the document with the specified unique ID.
+
+ Raises a KeyError if there is no such document. Otherwise, it returns
+ a ProcessedDocument.
+
+ """
+ if self._index is None:
+ raise _errors.IndexerError("IndexerConnection has been closed")
+ postlist = self._index.postlist('Q' + id)
+ try:
+ plitem = postlist.next()
+ except StopIteration:
+ # Unique ID not found
+ raise KeyError('Unique ID %r not found' % id)
+ try:
+ postlist.next()
+ raise _errors.IndexerError("Multiple documents " #pragma: no cover
+ "found with same unique ID")
+ except StopIteration:
+ # Only one instance of the unique ID found, as it should be.
+ pass
+
+ result = ProcessedDocument(self._field_mappings)
+ result.id = id
+ result._doc = self._index.get_document(plitem.docid)
+ return result
+
+class PrefixedTermIter(object):
+ """Iterate through all the terms with a given prefix.
+
+ """
+ def __init__(self, prefix, termiter):
+ """Initialise the prefixed term iterator.
+
+ - `prefix` is the prefix to return terms for.
+ - `termiter` is a xapian TermIterator, which should be at its start.
+
+ """
+
+ # The algorithm used in next() currently only works for single
+ # character prefixes, so assert that the prefix is single character. 
+ # To deal with multicharacter prefixes, we need to check for terms + # which have a starting prefix equal to that given, but then have a + # following uppercase alphabetic character, indicating that the actual + # prefix is longer than the target prefix. We then need to skip over + # these. Not too hard to implement, but we don't need it yet. + assert(len(prefix) == 1) + + self._started = False + self._prefix = prefix + self._prefixlen = len(prefix) + self._termiter = termiter + + def __iter__(self): + return self + + def next(self): + """Get the next term with the specified prefix. + + + """ + if not self._started: + term = self._termiter.skip_to(self._prefix).term + self._started = True + else: + term = self._termiter.next().term + if len(term) < self._prefixlen or term[:self._prefixlen] != self._prefix: + raise StopIteration + return term[self._prefixlen:] + +if __name__ == '__main__': + import doctest, sys + doctest.testmod (sys.modules[__name__]) diff --git a/secore/marshall.py b/secore/marshall.py new file mode 100644 index 0000000..ebcc71d --- /dev/null +++ b/secore/marshall.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python +# +# Copyright (C) 2007 Lemur Consulting Ltd +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+r"""marshall.py: Marshal values into strings + +""" +__docformat__ = "restructuredtext en" + +import math + +def _long_to_base256_array(value, length, flip): + result = [] + for i in xrange(length): + n = value % 256 + if flip: n = 255 - n + result.insert(0, chr(n)) + value /= 256 + return result + +def float_to_string(value): + """Marshall a floating point number to a string which sorts in the + appropriate manner. + + """ + mantissa, exponent = math.frexp(value) + sign = '1' + if mantissa < 0: + mantissa = -mantissa + sign = '0' + + # IEEE representation of doubles uses 11 bits for the exponent, with a bias + # of 1023. There's then another 52 bits in the mantissa, so we need to + # add 1075 to be sure that the exponent won't be negative. + # Even then, we check that the exponent isn't negative, and consider the + # value to be equal to zero if it is. + exponent += 1075 + if exponent < 0: # Note - this can't happen on most architectures #pragma: no cover + exponent = 0 + mantissa = 0 + elif mantissa == 0: + exponent = 0 + + # IEEE representation of doubles uses 52 bits for the mantissa. Convert it + # to a 7 character string, and convert the exponent to a 2 character + # string. + + mantissa = long(mantissa * (2**52)) + + digits = [sign] + digits.extend(_long_to_base256_array(exponent, 2, sign == '0')) + digits.extend(_long_to_base256_array(mantissa, 7, sign == '0')) + + return ''.join(digits) + +def date_to_string(date): + """Marshall a date to a string which sorts in the appropriate manner. 
+ + """ + return '%04d%02d%02d' % (date.year, date.month, date.day) diff --git a/secore/parsedate.py b/secore/parsedate.py new file mode 100644 index 0000000..684d5f2 --- /dev/null +++ b/secore/parsedate.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python +# +# Copyright (C) 2007 Lemur Consulting Ltd +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +r"""parsedate.py: Parse date strings. + +""" +__docformat__ = "restructuredtext en" + +import datetime +import re + +yyyymmdd_re = re.compile(r'(?P<year>[0-9]{4})(?P<month>[0-9]{2})(?P<day>[0-9]{2})$') +yyyy_mm_dd_re = re.compile(r'(?P<year>[0-9]{4})([-/.])(?P<month>[0-9]{2})\2(?P<day>[0-9]{2})$') + +def date_from_string(value): + """Parse a string into a date. + + If the value supplied is already a date-like object (ie, has 'year', + 'month' and 'day' attributes), it is returned without processing. 
+ + Supported date formats are: + + - YYYYMMDD + - YYYY-MM-DD + - YYYY/MM/DD + - YYYY.MM.DD + + """ + if (hasattr(value, 'year') + and hasattr(value, 'month') + and hasattr(value, 'day')): + return value + + mg = yyyymmdd_re.match(value) + if mg is None: + mg = yyyy_mm_dd_re.match(value) + + if mg is not None: + year, month, day = (int(i) for i in mg.group('year', 'month', 'day')) + return datetime.date(year, month, day) + + raise ValueError('Unrecognised date format') diff --git a/secore/searchconnection.py b/secore/searchconnection.py new file mode 100644 index 0000000..79fa509 --- /dev/null +++ b/secore/searchconnection.py @@ -0,0 +1,618 @@ +#!/usr/bin/env python +# +# Copyright (C) 2007 Lemur Consulting Ltd +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +r"""searchconnection.py: A connection to the search engine for searching. + +""" +__docformat__ = "restructuredtext en" + +import xapian as _xapian +from datastructures import * +from fieldactions import * +import fieldmappings as _fieldmappings +import highlight as _highlight +import errors as _errors +import os as _os +import cPickle as _cPickle + +class SearchResult(ProcessedDocument): + """A result from a search. 
+ + """ + def __init__(self, msetitem, results): + ProcessedDocument.__init__(self, results._fieldmappings, msetitem.document) + self.rank = msetitem.rank + self._results = results + + def _get_language(self, field): + """Get the language that should be used for a given field. + + """ + actions = self._results._conn._field_actions[field]._actions + for action, kwargslist in actions.iteritems(): + if action == FieldActions.INDEX_FREETEXT: + for kwargs in kwargslist: + try: + return kwargs['language'] + except KeyError: + pass + return 'none' + + def summarise(self, field, maxlen=600, hl=('<b>', '</b>')): + """Return a summarised version of the field specified. + + This will return a summary of the contents of the field stored in the + search result, with words which match the query highlighted. + + The maximum length of the summary (in characters) may be set using the + maxlen parameter. + + The return value will be a string holding the summary, with + highlighting applied. If there are multiple instances of the field in + the document, the instances will be joined with a newline character. + + To turn off highlighting, set hl to None. Each highlight will consist + of the first entry in the `hl` list being placed before the word, and + the second entry in the `hl` list being placed after the word. + + Any XML or HTML style markup tags in the field will be stripped before + the summarisation algorithm is applied. + + """ + highlighter = _highlight.Highlighter(language_code=self._get_language(field)) + field = self.data[field] + results = [] + text = '\n'.join(field) + return highlighter.makeSample(text, self._results._query, maxlen, hl) + + def highlight(self, field, hl=('<b>', '</b>'), strip_tags=False): + """Return a highlighted version of the field specified. + + This will return all the contents of the field stored in the search + result, with words which match the query highlighted. 
+ + The return value will be a list of strings (corresponding to the list + of strings which is the raw field data). + + Each highlight will consist of the first entry in the `hl` list being + placed before the word, and the second entry in the `hl` list being + placed after the word. + + If `strip_tags` is True, any XML or HTML style markup tags in the field + will be stripped before highlighting is applied. + + """ + highlighter = _highlight.Highlighter(language_code=self._get_language(field)) + field = self.data[field] + results = [] + for text in field: + results.append(highlighter.highlight(text, self._results._query, hl, strip_tags)) + return results + + def __repr__(self): + return ('<SearchResult(rank=%d, id=%r, data=%r)>' % + (self.rank, self.id, self.data)) + + +class SearchResultIter(object): + """An iterator over a set of results from a search. + + """ + def __init__(self, results): + self._results = results + self._iter = iter(results._mset) + + def next(self): + msetitem = self._iter.next() + return SearchResult(msetitem, + self._results) + + +class SearchResults(object): + """A set of results of a search. + + """ + def __init__(self, conn, enq, query, mset, fieldmappings): + self._conn = conn + self._enq = enq + self._query = query + self._mset = mset + self._fieldmappings = fieldmappings + + def __repr__(self): + return ("<SearchResults(startrank=%d, " + "endrank=%d, " + "more_matches=%s, " + "matches_lower_bound=%d, " + "matches_upper_bound=%d, " + "matches_estimated=%d, " + "estimate_is_exact=%s)>" % + ( + self.startrank, + self.endrank, + self.more_matches, + self.matches_lower_bound, + self.matches_upper_bound, + self.matches_estimated, + self.estimate_is_exact, + )) + + def _get_more_matches(self): + # This check relies on us having asked for at least one more result + # than retrieved to be checked. 
+ return (self.matches_lower_bound > self.endrank) + more_matches = property(_get_more_matches, doc= + """Check whether there are further matches after those in this result set. + + """) + def _get_startrank(self): + return self._mset.get_firstitem() + startrank = property(_get_startrank, doc= + """Get the rank of the first item in the search results. + + This corresponds to the "startrank" parameter passed to the search() method. + + """) + def _get_endrank(self): + return self._mset.get_firstitem() + len(self._mset) + endrank = property(_get_endrank, doc= + """Get the rank of the item after the end of the search results. + + If there are sufficient results in the index, this corresponds to the + "endrank" parameter passed to the search() method. + + """) + def _get_lower_bound(self): + return self._mset.get_matches_lower_bound() + matches_lower_bound = property(_get_lower_bound, doc= + """Get a lower bound on the total number of matching documents. + + """) + def _get_upper_bound(self): + return self._mset.get_matches_upper_bound() + matches_upper_bound = property(_get_upper_bound, doc= + """Get an upper bound on the total number of matching documents. + + """) + def _get_estimated(self): + return self._mset.get_matches_estimated() + matches_estimated = property(_get_estimated, doc= + """Get an estimate for the total number of matching documents. + + """) + def _estimate_is_exact(self): + return self._mset.get_matches_lower_bound() == \ + self._mset.get_matches_upper_bound() + estimate_is_exact = property(_estimate_is_exact, doc= + """Check whether the estimated number of matching documents is exact. + + If this returns true, the estimate given by the `matches_estimated` + property is guaranteed to be correct. + + If this returns false, it is possible that the actual number of matching + documents is different from the number given by the `matches_estimated` + property. + + """) + + def get_hit(self, index): + """Get the hit with a given index. 
+
+ """
+ msetitem = self._mset.get_hit(index)
+ return SearchResult(msetitem, self)
+ __getitem__ = get_hit
+
+ def __iter__(self):
+ """Get an iterator over the hits in the search result.
+
+ The iterator returns the results in increasing order of rank.
+
+ """
+ return SearchResultIter(self)
+
+class SearchConnection(object):
+ """A connection to the search engine for searching.
+
+ The connection will access a view of the database.
+
+ """
+
+ def __init__(self, indexpath):
+ """Create a new connection to the index for searching.
+
+ There may be an arbitrary number of search connections for a
+ particular database open at a given time (regardless of whether there
+ is a connection for indexing open as well).
+
+ If the database doesn't exist, an exception will be raised.
+
+ """
+ self._index = _xapian.Database(indexpath)
+ self._indexpath = indexpath
+
+ # Read the actions.
+ self._load_config()
+
+ def _get_sort_type(self, field):
+ """Get the sort type that should be used for a given field.
+
+ """
+ actions = self._field_actions[field]._actions
+ for action, kwargslist in actions.iteritems():
+ if action == FieldActions.SORT_AND_COLLAPSE:
+ for kwargs in kwargslist:
+ return kwargs['type']
+
+ def _load_config(self):
+ """Load the configuration for the database.
+
+ """
+ # Note: this code is basically duplicated in the IndexerConnection
+ # class. Move it to a shared location.
+ config_file = _os.path.join(self._indexpath, 'config')
+ if not _os.path.exists(config_file):
+ self._field_mappings = _fieldmappings.FieldMappings()
+ return
+ fd = open(config_file)
+ config_str = fd.read()
+ fd.close()
+
+ (self._field_actions, mappings, next_docid) = _cPickle.loads(config_str)
+ self._field_mappings = _fieldmappings.FieldMappings(mappings)
+
+ def reopen(self):
+ """Reopen the connection.
+
+ This updates the revision of the index which the connection references
+ to the latest flushed revision. 
        """
        if self._index is None:
            raise _errors.SearchError("SearchConnection has been closed")
        self._index.reopen()
        # Re-read the actions.
        self._load_config()

    def close(self):
        """Close the connection to the database.

        It is important to call this method before allowing the class to be
        garbage collected to ensure that the connection is cleaned up promptly.

        No other methods may be called on the connection after this has been
        called. (It is permissible to call close() multiple times, but
        only the first call will have any effect.)

        If an exception occurs, the database will be closed, but changes since
        the last call to flush may be lost.

        """
        if self._index is None:
            return
        # There is currently no "close()" method for xapian databases, so
        # we have to rely on the garbage collector. Since we never copy
        # the _index property out of this class, there should be no cycles,
        # so the standard python implementation should garbage collect
        # _index straight away. A close() method is planned to be added to
        # xapian at some point - when it is, we should call it here to make
        # the code more robust.
        self._index = None
        self._indexpath = None
        self._field_actions = None
        self._field_mappings = None

    def get_doccount(self):
        """Count the number of documents in the database.

        This count will include documents which have been added or removed but
        not yet flushed().

        """
        if self._index is None:
            raise _errors.SearchError("SearchConnection has been closed")
        return self._index.get_doccount()

    def get_document(self, id):
        """Get the document with the specified unique ID.

        Raises a KeyError if there is no such document. Otherwise, it returns
        a ProcessedDocument.

        """
        if self._index is None:
            raise _errors.SearchError("SearchConnection has been closed")
        # Unique IDs are stored as terms with the 'Q' prefix, so the
        # document (if any) is found by walking that term's posting list.
        postlist = self._index.postlist('Q' + id)
        try:
            plitem = postlist.next()
        except StopIteration:
            # Unique ID not found
            raise KeyError('Unique ID %r not found' % id)
        try:
            # A second posting for the same 'Q' term means the uniqueness
            # invariant has been violated; StopIteration is the good case.
            postlist.next()
            raise _errors.SearchError("Multiple documents " #pragma: no cover
                                      "found with same unique ID")
        except StopIteration:
            # Only one instance of the unique ID found, as it should be.
            pass

        result = ProcessedDocument(self._field_mappings)
        result.id = id
        result._doc = self._index.get_document(plitem.docid)
        return result

    # Query-combination operators, re-exported from xapian for callers'
    # convenience (used by query_composite and as default_op values).
    OP_AND = _xapian.Query.OP_AND
    OP_OR = _xapian.Query.OP_OR
    def query_composite(self, operator, queries):
        """Build a composite query from a list of queries.

        The queries are combined with the supplied operator, which is either
        SearchConnection.OP_AND or SearchConnection.OP_OR.

        """
        if self._index is None:
            raise _errors.SearchError("SearchConnection has been closed")
        return _xapian.Query(operator, list(queries))

    def query_filter(self, query, filter):
        """Filter a query with another query.

        Documents will only match the resulting query if they match both
        queries, but will be weighted according to only the first query.

        - `query`: The query to filter.
        - `filter`: The filter to apply to the query.

        """
        if self._index is None:
            raise _errors.SearchError("SearchConnection has been closed")
        if not isinstance(filter, _xapian.Query):
            raise _errors.SearchError("Filter must be a Xapian Query object")
        # OP_FILTER: matches must satisfy both subqueries, but only the
        # left-hand query contributes to the weight.
        return _xapian.Query(_xapian.Query.OP_FILTER, query, filter)

    def query_range(self, field, begin, end):
        """Create a query for a range search.

        This creates a query which matches only those documents which have a
        field value in the specified range.

        Begin and end must be appropriate values for the field, according to
        the 'type' parameter supplied to the SORTABLE action for the field.

        The begin and end values are both inclusive - any documents with a
        value equal to begin or end will be returned (unless end is less than
        begin, in which case no documents will be returned).

        """
        if self._index is None:
            raise _errors.SearchError("SearchConnection has been closed")

        # Marshall the endpoints into the same sortable string form that was
        # used when the field values were stored, so the value-range
        # comparison is performed on comparable representations.
        sorttype = self._get_sort_type(field)
        marshaller = SortableMarshaller(False)
        fn = marshaller.get_marshall_function(field, sorttype)
        begin = fn(field, begin)
        end = fn(field, end)

        slot = self._field_mappings.get_slot(field)
        return _xapian.Query(_xapian.Query.OP_VALUE_RANGE, slot, begin, end)

    def _prepare_queryparser(self, allow, deny, default_op):
        """Prepare (and return) a query parser using the specified fields and
        operator.

        Exactly one of `allow` and `deny` may be supplied; `allow` defaults
        to every configured field.

        """
        if self._index is None:
            raise _errors.SearchError("SearchConnection has been closed")
        if allow is not None and deny is not None:
            raise _errors.SearchError("Cannot specify both `allow` and `deny`")
        qp = _xapian.QueryParser()
        qp.set_database(self._index)
        qp.set_default_op(default_op)

        if allow is None:
            allow = [key for key in self._field_actions]
        if deny is not None:
            allow = [key for key in allow if key not in deny]

        for field in allow:
            actions = self._field_actions[field]._actions
            for action, kwargslist in actions.iteritems():
                if action == FieldActions.INDEX_EXACT:
                    # FIXME - need patched version of xapian to add exact prefixes
                    #qp.add_exact_prefix(field, self._field_mappings.get_prefix(field))
                    qp.add_prefix(field, self._field_mappings.get_prefix(field))
                if action == FieldActions.INDEX_FREETEXT:
                    qp.add_prefix(field, self._field_mappings.get_prefix(field))
                    for kwargs in kwargslist:
                        # 'language' is optional in the action's kwargs; only
                        # enable stemming when it was configured.
                        try:
                            lang = kwargs['language']
                            qp.set_stemmer(_xapian.Stem(lang))
                            qp.set_stemming_strategy(qp.STEM_SOME)
                        except KeyError:
                            pass
        return qp

    def query_parse(self, string, allow=None, deny=None, default_op=OP_AND):
        """Parse a query string.

        This is intended for parsing queries entered by a user. If you wish to
        combine structured queries, it is generally better to use the other
        query building methods, such as `query_composite`.

        - `string`: The string to parse.
        - `allow`: A list of fields to allow in the query.
        - `deny`: A list of fields not to allow in the query.

        Only one of `allow` and `deny` may be specified.

        If any of the entries in `allow` or `deny` are not present in the
        configuration for the database, an exception will be raised.

        Returns a Query object, which may be passed to the search() method, or
        combined with other queries.

        """
        qp = self._prepare_queryparser(allow, deny, default_op)
        try:
            return qp.parse_query(string)
        except _xapian.QueryParserError, e:
            # If we got a parse error, retry without boolean operators (since
            # these are the usual cause of the parse error).
            # flags=0 disables all special query syntax.
            return qp.parse_query(string, 0)

    def query_field(self, field, value, default_op=OP_AND):
        """A query for a single field.

        """
        if self._index is None:
            raise _errors.SearchError("SearchConnection has been closed")
        try:
            actions = self._field_actions[field]._actions
        except KeyError:
            # Unknown field: fall through to the empty (match-nothing) query.
            actions = {}

        # need to check on field type, and stem / split as appropriate
        for action, kwargslist in actions.iteritems():
            if action == FieldActions.INDEX_EXACT:
                prefix = self._field_mappings.get_prefix(field)
                if len(value) > 0:
                    # Xapian convention: a ':' is inserted between the prefix
                    # and a term starting with a capital letter, so the end of
                    # the prefix is unambiguous.
                    chval = ord(value[0])
                    if chval >= ord('A') and chval <= ord('Z'):
                        prefix = prefix + ':'
                return _xapian.Query(prefix + value)
            if action == FieldActions.INDEX_FREETEXT:
                qp = _xapian.QueryParser()
                qp.set_default_op(default_op)
                prefix = self._field_mappings.get_prefix(field)
                for kwargs in kwargslist:
                    # Enable stemming only if a language was configured.
                    try:
                        lang = kwargs['language']
                        qp.set_stemmer(_xapian.Stem(lang))
                        qp.set_stemming_strategy(qp.STEM_SOME)
                    except KeyError:
                        pass
                return qp.parse_query(value,
                                      qp.FLAG_PHRASE | qp.FLAG_BOOLEAN | qp.FLAG_LOVEHATE,
                                      prefix)

        # Field is neither INDEX_EXACT nor INDEX_FREETEXT: match nothing.
        return _xapian.Query()

    def query_all(self):
        """A query which matches all the documents in the database.

        """
        return _xapian.Query('')

    def spell_correct(self, string, allow=None, deny=None):
        """Correct a query spelling.

        This returns a version of the query string with any misspelt words
        corrected.

        - `allow`: A list of fields to allow in the query.
        - `deny`: A list of fields not to allow in the query.

        Only one of `allow` and `deny` may be specified.

        If any of the entries in `allow` or `deny` are not present in the
        configuration for the database, an exception will be raised.

        """
        qp = self._prepare_queryparser(allow, deny, self.OP_AND)
        qp.parse_query(string, qp.FLAG_PHRASE|qp.FLAG_BOOLEAN|qp.FLAG_LOVEHATE|qp.FLAG_SPELLING_CORRECTION)
        # An empty corrected string means no correction was needed; return
        # the original (encoded for consistency with xapian's behaviour).
        corrected = qp.get_corrected_query_string()
        if len(corrected) == 0:
            if isinstance(string, unicode):
                # Encode as UTF-8 for consistency - this happens automatically
                # to values passed to Xapian.
                return string.encode('utf-8')
            return string
        return corrected

    def search(self, query, startrank, endrank,
               checkatleast=0, sortby=None, collapse=None):
        """Perform a search, for documents matching a query.

        - `query` is the query to perform.
        - `startrank` is the rank of the start of the range of matching
          documents to return (ie, the result with this rank will be returned).
          ranks start at 0, which represents the "best" matching document.
        - `endrank` is the rank at the end of the range of matching documents
          to return. This is exclusive, so the result with this rank will not
          be returned.
        - `checkatleast` is the minimum number of results to check for: the
          estimate of the total number of matches will always be exact if
          the number of matches is less than `checkatleast`.
        - `sortby` is the name of a field to sort by. It may be preceded by a
          '+' or a '-' to indicate ascending or descending order
          (respectively). If the first character is neither '+' or '-', the
          sort will be in ascending order.
        - `collapse` is the name of a field to collapse the result documents
          on. If this is specified, there will be at most one result in the
          result set for each value of the field.

        """
        if self._index is None:
            raise _errors.SearchError("SearchConnection has been closed")
        enq = _xapian.Enquire(self._index)
        enq.set_query(query)

        if sortby is not None:
            # Strip the optional leading '+'/'-' direction indicator.
            asc = True
            if sortby[0] == '-':
                asc = False
                sortby = sortby[1:]
            elif sortby[0] == '+':
                sortby = sortby[1:]

            try:
                slotnum = self._field_mappings.get_slot(sortby)
            except KeyError:
                raise _errors.SearchError("Field %r was not indexed for sorting" % sortby)

            # Note: we invert the "asc" parameter, because xapian treats
            # "ascending" as meaning "higher values are better"; in other
            # words, it considers "ascending" to mean return results in
            # descending order.
            enq.set_sort_by_value_then_relevance(slotnum, not asc)

        if collapse is not None:
            try:
                slotnum = self._field_mappings.get_slot(collapse)
            except KeyError:
                raise _errors.SearchError("Field %r was not indexed for collapsing" % collapse)
            enq.set_collapse_key(slotnum)

        maxitems = max(endrank - startrank, 0)
        # Always check for at least one more result, so we can report whether
        # there are more matches.
        checkatleast = max(checkatleast, endrank + 1)

        # We don't depend on any particular docid order, so let xapian pick
        # whichever is cheapest.
        enq.set_docid_order(enq.DONT_CARE)

        # Repeat the search until we don't get a DatabaseModifiedError
        # (another process may have modified the database under us; reopening
        # picks up the new revision, then we retry).
        while True:
            try:
                mset = enq.get_mset(startrank, maxitems, checkatleast)
                break
            except _xapian.DatabaseModifiedError, e:
                self.reopen()
        return SearchResults(self, enq, query, mset, self._field_mappings)

if __name__ == '__main__':
    import doctest, sys
    doctest.testmod (sys.modules[__name__])