#!/usr/bin/env python # # Copyright (C) 2007 Lemur Consulting Ltd # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License along # with this program; if not, write to the Free Software Foundation, Inc., # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. r"""datastructures.py: Datastructures for search engine core. """ __docformat__ = "restructuredtext en" import xapian as _xapian import cPickle as _cPickle class Field(object): # Use __slots__ because we're going to have very many Field objects in # typical usage. __slots__ = 'name', 'value' def __init__(self, name, value): self.name = name self.value = value def __repr__(self): return 'Field(%r, %r)' % (self.name, self.value) class UnprocessedDocument(object): """A unprocessed document to be passed to the indexer. This represents an item to be processed and stored in the search engine. Each document will be processed by the indexer to generate a ProcessedDocument, which can then be stored in the search engine index. Note that some information in an UnprocessedDocument will not be represented in the ProcessedDocument: therefore, it is not possible to retrieve an UnprocessedDocument from the search engine index. An unprocessed document is a simple container with two attributes: - `fields` is a list of Field objects. - `id` is a string holding a unique identifier for the document (or None to get the database to allocate a unique identifier automatically when the document is added). """ __slots__ = 'id', 'fields', def __init__(self, id=None, fields=None): self.id = id if fields is None: self.fields = [] else: self.fields = fields def __repr__(self): return 'UnprocessedDocument(%r, %r)' % (self.id, self.fields) class ProcessedDocument(object): """A processed document, as stored in the index. This represents an item which is ready to be stored in the search engine, or which has been returned by the search engine. """ __slots__ = '_doc', '_fieldmappings', '_data', def __init__(self, fieldmappings, xapdoc=None): """Create a ProcessedDocument. `fieldmappings` is the configuration from a database connection used lookup the configuration to use to store each field. If supplied, `xapdoc` is a Xapian document to store in the processed document. Otherwise, a new Xapian document is created. """ if xapdoc is None: self._doc = _xapian.Document() else: self._doc = xapdoc self._fieldmappings = fieldmappings self._data = None def add_term(self, field, term, wdfinc=1, positions=None): """Add a term to the document. Terms are the main unit of information used for performing searches. - `field` is the field to add the term to. - `term` is the term to add. - `wdfinc` is the value to increase the within-document-frequency measure for the term by. - `positions` is the positional information to add for the term. This may be None to indicate that there is no positional information, or may be an integer to specify one position, or may be a sequence of integers to specify several positions. (Note that the wdf is not increased automatically for each position: if you add a term at 7 positions, and the wdfinc value is 2, the total wdf for the term will only be increased by 2, not by 14.) """ prefix = self._fieldmappings.get_prefix(field) if len(term) > 0: # We use the following check, rather than "isupper()" to ensure # that we match the check performed by the queryparser, regardless # of our locale. if ord(term[0]) >= ord('A') and ord(term[0]) <= ord('Z'): prefix = prefix + ':' if positions is None: self._doc.add_term(prefix + term, wdfinc) elif isinstance(positions, int): self._doc.add_posting(prefix + term, positions, wdfinc) else: self._doc.add_term(prefix + term, wdfinc) for pos in positions: self._doc.add_posting(prefix + term, pos, 0) def add_value(self, field, value): """Add a value to the document. Values are additional units of information used when performing searches. Note that values are _not_ intended to be used to store information for display in the search results - use the document data for that. The intention is that as little information as possible is stored in values, so that they can be accessed as quickly as possible during the search operation. Unlike terms, each document may have at most one value in each field (whereas there may be an arbitrary number of terms in a given field). If an attempt to add multiple values to a single field is made, only the last value added will be stored. """ slot = self._fieldmappings.get_slot(field) self._doc.add_value(slot, value) def get_value(self, field): """Get a value from the document. """ slot = self._fieldmappings.get_slot(field) return self._doc.get_value(slot) def prepare(self): """Prepare the document for adding to a xapian database. This updates the internal xapian document with any changes which have been made, and then returns it. """ if self._data is not None: self._doc.set_data(_cPickle.dumps(self._data, 2)) self._data = None return self._doc def _get_data(self): if self._data is None: rawdata = self._doc.get_data() if rawdata == '': self._data = {} else: self._data = _cPickle.loads(rawdata) return self._data def _set_data(self, data): if not isinstance(data, dict): raise TypeError("Cannot set data to any type other than a dict") self._data = data data = property(_get_data, _set_data, doc= """The data stored in this processed document. This data is a dictionary of entries, where the key is a fieldname, and the value is a list of strings. """) def _get_id(self): tl = self._doc.termlist() try: term = tl.skip_to('Q').term if len(term) == 0 or term[0] != 'Q': return None except StopIteration: return None return term[1:] def _set_id(self, id): tl = self._doc.termlist() try: term = tl.skip_to('Q').term except StopIteration: term = '' if len(term) != 0 and term[0] == 'Q': self._doc.remove_term(term) if id is not None: self._doc.add_term('Q' + id, 0) id = property(_get_id, _set_id, doc= """The unique ID for this document. """) def __repr__(self): return '' % (self.id) if __name__ == '__main__': import doctest, sys doctest.testmod (sys.modules[__name__])