author     Benjamin Saller <bcsaller@objectrealms.net>    2007-07-30 04:35:42 (GMT)
committer  Benjamin Saller <bcsaller@objectrealms.net>    2007-07-30 04:35:42 (GMT)
commit     a31cd062c96fbbd337d181a33b3890d8c2b5f32b (patch)
tree       e94b10b55d72b00fc5dbcf7658a2cfdc8c95549c
parent     4a4283978ffd0e517ef49ccd78704ff328de1393 (diff)
updated secore to latest
-rw-r--r--   secore/datastructures.py       8
-rw-r--r--   secore/fieldactions.py        46
-rw-r--r--   secore/fieldmappings.py       11
-rw-r--r--   secore/indexerconnection.py  118
-rw-r--r--   secore/marshall.py            40
-rw-r--r--   secore/searchconnection.py   343
6 files changed, 495 insertions, 71 deletions
diff --git a/secore/datastructures.py b/secore/datastructures.py
index 414625d..b7061fa 100644
--- a/secore/datastructures.py
+++ b/secore/datastructures.py
@@ -20,6 +20,7 @@ r"""datastructures.py: Datastructures for search engine core.
"""
__docformat__ = "restructuredtext en"
+import errors as _errors
import xapian as _xapian
import cPickle as _cPickle
@@ -117,6 +118,13 @@ class ProcessedDocument(object):
# of our locale.
if ord(term[0]) >= ord('A') and ord(term[0]) <= ord('Z'):
prefix = prefix + ':'
+
+ if len(prefix + term) > 220:
+ raise _errors.IndexerError("Field %r is too long: maximum length "
+ "220 - was %d (%r)" %
+ (field, len(prefix + term),
+ prefix + term))
+
if positions is None:
self._doc.add_term(prefix + term, wdfinc)
elif isinstance(positions, int):
diff --git a/secore/fieldactions.py b/secore/fieldactions.py
index c595f0b..3de7cc9 100644
--- a/secore/fieldactions.py
+++ b/secore/fieldactions.py
@@ -42,6 +42,22 @@ def _act_index_exact(fieldname, doc, value, context):
"""
doc.add_term(fieldname, value, 0)
+def _act_tag(fieldname, doc, value, context):
+ """Perform the TAG action.
+
+ """
+ doc.add_term(fieldname, value.lower(), 0)
+
+def _act_facet(fieldname, doc, value, context, type=None):
+ """Perform the FACET action.
+
+ """
+ marshaller = SortableMarshaller()
+ fn = marshaller.get_marshall_function(fieldname, type)
+ doc.add_term(fieldname, value.lower(), 0)
+ value = fn(fieldname, value)
+ doc.add_value(fieldname, value)
+
def _act_index_freetext(fieldname, doc, value, context, weight=1,
language=None, stop=None, spell=False,
nopos=False, noprefix=False):
@@ -210,7 +226,7 @@ class FieldActions(object):
- 'string' - sort in lexicographic (ie, alphabetical) order.
This is the default, used if no type is set.
- 'float' - treat the values as (decimal representations of) floating
- point numbers, and sort in numerical order . The values in the field
+ point numbers, and sort in numerical order. The values in the field
must be valid floating point numbers (according to Python's float()
function).
- 'date' - sort in date order. The values must be valid dates (either
@@ -221,6 +237,23 @@ class FieldActions(object):
"collapse" result sets, such that only the highest result with each value
of the field will be returned.
+ - `TAG`: the field contains tags; these are strings, which will be matched
+ in a case-insensitive way, but otherwise must be exact matches. Tag
+ fields can be searched for by making an explicit query (ie, using
+ query_field(), but not with query_parse()). A list of the most frequent
+ tags in a result set can also be accessed easily.
+
+ - `FACET`: the field represents a classification facet; these are strings
+ which will be matched exactly. A list of all the facets present in the
+ result set can also be accessed easily; in addition, a suitable subset of
+ the facets present in the result set, and a selection of values for each,
+ can be calculated. One optional parameter may be supplied:
+
+ - 'type' is a value indicating the type of facet contained in the field:
+
+ - 'string' - the facet values are exact binary strings.
+ - 'float' - the facet values are floating point numbers.
+
"""
# See the class docstring for the meanings of the following constants.
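For reference, a minimal configuration sketch for the two new actions. It assumes add_field_action() on IndexerConnection is the entry point for registering actions (that method is not part of this diff), and the database path and field names are purely illustrative:

    from secore.fieldactions import FieldActions
    from secore.indexerconnection import IndexerConnection

    iconn = IndexerConnection('exampledb')
    # TAG: values are lowercased and matched exactly; query via query_field().
    iconn.add_field_action('tags', FieldActions.TAG)
    # FACET: exact-match classification values; 'type' is 'string' (default) or 'float'.
    iconn.add_field_action('category', FieldActions.FACET, type='string')
    iconn.add_field_action('price', FieldActions.FACET, type='float')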
@@ -229,6 +262,8 @@ class FieldActions(object):
INDEX_FREETEXT = 3
SORTABLE = 4
COLLAPSE = 5
+ TAG = 6
+ FACET = 7
# Sorting and collapsing store the data in a value, but the format depends
# on the sort type. Easiest way to implement is to treat them as the same
@@ -253,7 +288,10 @@ class FieldActions(object):
FieldActions.INDEX_EXACT,
FieldActions.INDEX_FREETEXT,
FieldActions.SORTABLE,
- FieldActions.COLLAPSE,):
+ FieldActions.COLLAPSE,
+ FieldActions.TAG,
+ FieldActions.FACET,
+ ):
raise _errors.IndexerError("Unknown field action: %r" % action)
info = self._action_info[action]
@@ -312,7 +350,7 @@ class FieldActions(object):
raise _errors.IndexerError("Field %r is already marked for "
"sorting, with a different "
"sort type" % self._fieldname)
-
+
if self.NEED_PREFIX in info[3]:
field_mappings.add_prefix(self._fieldname)
if self.NEED_SLOT in info[3]:
@@ -351,6 +389,8 @@ class FieldActions(object):
SORTABLE: ('SORTABLE', ('type', ), None, (NEED_SLOT,), ),
COLLAPSE: ('COLLAPSE', (), None, (NEED_SLOT,), ),
SORT_AND_COLLAPSE: ('SORT_AND_COLLAPSE', ('type', ), _act_sort_and_collapse, (NEED_SLOT,), ),
+ TAG: ('TAG', (), _act_tag, (NEED_PREFIX,), ),
+ FACET: ('FACET', ('type', ), _act_facet, (NEED_PREFIX, NEED_SLOT,), ),
}
if __name__ == '__main__':
diff --git a/secore/fieldmappings.py b/secore/fieldmappings.py
index 3838ce5..7347448 100644
--- a/secore/fieldmappings.py
+++ b/secore/fieldmappings.py
@@ -76,6 +76,17 @@ class FieldMappings(object):
num = num // 26
return 'X' + ''.join(res)
+ def get_fieldname_from_prefix(self, prefix):
+ """Get a fieldname from a prefix.
+
+ If the prefix is not found, return None.
+
+ """
+ for key, val in self._prefixes.iteritems():
+ if val == prefix:
+ return key
+ return None
+
def get_prefix(self, fieldname):
"""Get the prefix used for a given field name.
diff --git a/secore/indexerconnection.py b/secore/indexerconnection.py
index be82319..87fdd35 100644
--- a/secore/indexerconnection.py
+++ b/secore/indexerconnection.py
@@ -225,6 +225,56 @@ class IndexerConnection(object):
xapdoc = document.prepare()
self._index.replace_document('Q' + id, xapdoc)
+ def _make_synonym_key(self, original, field):
+ """Make a synonym key (ie, the term or group of terms to store in
+ xapian).
+
+ """
+ if field is not None:
+ prefix = self._field_mappings.get_prefix(field)
+ else:
+ prefix = ''
+ original = original.lower()
+ # Add the prefix to the start of each word.
+ return ' '.join((prefix + word for word in original.split(' ')))
+
+ def add_synonym(self, original, synonym, field=None):
+ """Add a synonym to the index.
+
+ - `original` is the word or words which will be synonym expanded in
+ searches (if multiple words are specified, each word should be
+ separated by a single space).
+ - `synonym` is a synonym for `original`.
+ - `field` is the field which this synonym is specific to. If no field
+ is specified, the synonym will be used for searches which are not
+ specific to any particular field.
+
+ """
+ key = self._make_synonym_key(original, field)
+ self._index.add_synonym(key, synonym.lower())
+
+ def remove_synonym(self, original, synonym, field=None):
+ """Remove a synonym from the index.
+
+ - `field` is the field which this synonym is specific to. If no field
+ is specified, the synonym will be used for searches which are not
+ specific to any particular field.
+
+ """
+ key = self._make_synonym_key(original, field)
+ self._index.remove_synonym(key, synonym)
+
+ def clear_synonyms(self, original, field=None):
+ """Remove all synonyms for a word (or phrase).
+
+ - `field` is the field which this synonym is specific to. If no field
+ is specified, the synonym will be used for searches which are not
+ specific to any particular field.
+
+ """
+ key = self._make_synonym_key(original, field)
+ self._index.clear_synonyms(key)
+
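A short usage sketch for the synonym API added above. The database path and field name are illustrative, and 'title' is assumed to be a field that already has a term prefix allocated (for example via an indexing action):

    from secore.indexerconnection import IndexerConnection

    conn = IndexerConnection('exampledb')
    # Unfielded synonym: searches for "colour" may also match "color".
    conn.add_synonym('colour', 'color')
    # Field-specific synonym: _make_synonym_key() lowercases the original and
    # prepends the field's term prefix to each word before storing the key.
    conn.add_synonym('olpc laptop', 'xo', field='title')
    conn.remove_synonym('colour', 'color')
    conn.clear_synonyms('olpc laptop', field='title')
    conn.flush()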
def delete(self, id):
"""Delete a document from the search engine index.
@@ -332,6 +382,32 @@ class IndexerConnection(object):
result._doc = self._index.get_document(plitem.docid)
return result
+ def iter_synonyms(self, prefix=""):
+ """Get an iterator over the synonyms.
+
+ - `prefix`: if specified, only synonym keys with this prefix will be
+ returned.
+
+ The iterator returns 2-tuples, in which the first item is the key (ie,
+ a 2-tuple holding the term or terms which will be synonym expanded,
+ followed by the fieldname specified (or None if no fieldname)), and the
+ second item is a tuple of strings holding the synonyms for the first
+ item.
+
+ These return values are suitable for the dict() builtin, so you can
+ write things like:
+
+ >>> conn = IndexerConnection('foo')
+ >>> conn.add_synonym('foo', 'bar')
+ >>> conn.add_synonym('foo bar', 'baz')
+ >>> conn.add_synonym('foo bar', 'foo baz')
+ >>> dict(conn.iter_synonyms())
+ {('foo', None): ('bar',), ('foo bar', None): ('baz', 'foo baz')}
+
+ """
+ return SynonymIter(self._index, self._field_mappings, prefix)
+
+
class PrefixedTermIter(object):
"""Iterate through all the terms with a given prefix.
@@ -340,7 +416,7 @@ class PrefixedTermIter(object):
"""Initialise the prefixed term iterator.
- `prefix` is the prefix to return terms for.
- - `termiter` is a xapian TermIterator, which should be at it's start.
+ - `termiter` is a xapian TermIterator, which should be at its start.
"""
@@ -364,7 +440,6 @@ class PrefixedTermIter(object):
def next(self):
"""Get the next term with the specified prefix.
-
"""
if not self._started:
term = self._termiter.skip_to(self._prefix).term
@@ -375,6 +450,45 @@ class PrefixedTermIter(object):
raise StopIteration
return term[self._prefixlen:]
+
+class SynonymIter(object):
+ """Iterate through a list of synonyms.
+
+ """
+ def __init__(self, index, field_mappings, prefix):
+ """Initialise the synonym iterator.
+
+ - `index` is the index to get the synonyms from.
+ - `field_mappings` is the FieldMappings object for the iterator.
+ - `prefix` is the prefix to restrict the returned synonyms to.
+
+ """
+ self._index = index
+ self._field_mappings = field_mappings
+ self._syniter = self._index.synonym_keys(prefix)
+
+ def __iter__(self):
+ return self
+
+ def next(self):
+ """Get the next synonym.
+
+ """
+ synkey = self._syniter.next()
+ pos = 0
+ for char in synkey:
+ if char.isupper(): pos += 1
+ else: break
+ if pos == 0:
+ fieldname = None
+ terms = synkey
+ else:
+ prefix = synkey[:pos]
+ fieldname = self._field_mappings.get_fieldname_from_prefix(prefix)
+ terms = ' '.join((term[pos:] for term in synkey.split(' ')))
+ synval = tuple(self._index.synonyms(synkey))
+ return ((terms, fieldname), synval)
+
if __name__ == '__main__':
import doctest, sys
doctest.testmod (sys.modules[__name__])
diff --git a/secore/marshall.py b/secore/marshall.py
index ebcc71d..8f1caee 100644
--- a/secore/marshall.py
+++ b/secore/marshall.py
@@ -21,50 +21,14 @@ r"""marshall.py: Marshal values into strings
__docformat__ = "restructuredtext en"
import math
-
-def _long_to_base256_array(value, length, flip):
- result = []
- for i in xrange(length):
- n = value % 256
- if flip: n = 255 - n
- result.insert(0, chr(n))
- value /= 256
- return result
+import xapian
def float_to_string(value):
"""Marshall a floating point number to a string which sorts in the
appropriate manner.
"""
- mantissa, exponent = math.frexp(value)
- sign = '1'
- if mantissa < 0:
- mantissa = -mantissa
- sign = '0'
-
- # IEEE representation of doubles uses 11 bits for the exponent, with a bias
- # of 1023. There's then another 52 bits in the mantissa, so we need to
- # add 1075 to be sure that the exponent won't be negative.
- # Even then, we check that the exponent isn't negative, and consider the
- # value to be equal to zero if it is.
- exponent += 1075
- if exponent < 0: # Note - this can't happen on most architectures #pragma: no cover
- exponent = 0
- mantissa = 0
- elif mantissa == 0:
- exponent = 0
-
- # IEEE representation of doubles uses 52 bits for the mantissa. Convert it
- # to a 7 character string, and convert the exponent to a 2 character
- # string.
-
- mantissa = long(mantissa * (2**52))
-
- digits = [sign]
- digits.extend(_long_to_base256_array(exponent, 2, sign == '0'))
- digits.extend(_long_to_base256_array(mantissa, 7, sign == '0'))
-
- return ''.join(digits)
+ return xapian.sortable_serialise(value)
def date_to_string(date):
"""Marshall a date to a string which sorts in the appropriate manner.
diff --git a/secore/searchconnection.py b/secore/searchconnection.py
index 79fa509..f7caeab 100644
--- a/secore/searchconnection.py
+++ b/secore/searchconnection.py
@@ -20,14 +20,16 @@ r"""searchconnection.py: A connection to the search engine for searching.
"""
__docformat__ = "restructuredtext en"
+import os as _os
+import cPickle as _cPickle
+
import xapian as _xapian
from datastructures import *
from fieldactions import *
import fieldmappings as _fieldmappings
import highlight as _highlight
import errors as _errors
-import os as _os
-import cPickle as _cPickle
+import indexerconnection as _indexerconnection
class SearchResult(ProcessedDocument):
"""A result from a search.
@@ -42,7 +44,10 @@ class SearchResult(ProcessedDocument):
"""Get the language that should be used for a given field.
"""
- actions = self._results._conn._field_actions[field]._actions
+ try:
+ actions = self._results._conn._field_actions[field]._actions
+ except KeyError:
+ actions = {}
for action, kwargslist in actions.iteritems():
if action == FieldActions.INDEX_FREETEXT:
for kwargs in kwargslist:
@@ -118,20 +123,24 @@ class SearchResultIter(object):
def next(self):
msetitem = self._iter.next()
- return SearchResult(msetitem,
- self._results)
+ return SearchResult(msetitem, self._results)
class SearchResults(object):
"""A set of results of a search.
"""
- def __init__(self, conn, enq, query, mset, fieldmappings):
+ def __init__(self, conn, enq, query, mset, fieldmappings, tagspy,
+ facetspy, facetfields):
self._conn = conn
self._enq = enq
self._query = query
self._mset = mset
self._fieldmappings = fieldmappings
+ self._tagspy = tagspy
+ self._facetspy = facetspy
+ self._facetfields = facetfields
+ self._numeric_ranges_built = {}
def __repr__(self):
return ("<SearchResults(startrank=%d, "
@@ -225,12 +234,106 @@ class SearchResults(object):
"""
return SearchResultIter(self)
+ def get_top_tags(self, field, maxtags):
+ """Get the most frequent tags in a given field.
+
+ - `field` - the field to get tags for. This must have been specified
+ in the "gettags" argument of the search() call.
+ - `maxtags` - the maximum number of tags to return.
+
+ Returns a sequence of 2-item tuples, in which the first item in the
+ tuple is the tag, and the second is the frequency of the tag in the
+ matches seen (as an integer).
+
+ """
+ if self._tagspy is None:
+ raise _errors.SearchError("Field %r was not specified for getting tags" % field)
+ try:
+ prefix = self._conn._field_mappings.get_prefix(field)
+ except KeyError:
+ raise _errors.SearchError("Field %r was not indexed for tagging" % field)
+ return self._tagspy.get_top_terms(prefix, maxtags)
+
+ def get_suggested_facets(self, maxfacets=5, desired_num_of_categories=7):
+ """Get a suggested set of facets, to present to the user.
+
+ This returns a list, in descending order of the usefulness of the
+ facet, in which each item is a tuple holding:
+
+ - fieldname of facet.
+ - sequence of 2-tuples holding the suggested values or ranges for that
+ field:
+
+ For facets of type 'string', the first item in the 2-tuple will
+ simply be the string supplied when the facet value was added to its
+ document. For facets of type 'float', it will be a 2-tuple, holding
+ floats giving the start and end of the suggested value range.
+
+ The second item in the 2-tuple will be the frequency of the facet
+ value or range in the result set.
+
+ """
+ if self._facetspy is None:
+ return []
+ scores = []
+ facettypes = {}
+ for field, slot, kwargslist in self._facetfields:
+ type = None
+ for kwargs in kwargslist:
+ type = kwargs.get('type', None)
+ if type is not None: break
+ if type is None: type = 'string'
+
+ if type == 'float':
+ if field not in self._numeric_ranges_built:
+ self._facetspy.build_numeric_ranges(slot, desired_num_of_categories)
+ self._numeric_ranges_built[field] = None
+ facettypes[field] = type
+ score = self._facetspy.score_categorisation(slot,
+ desired_num_of_categories)
+ scores.append((score, field, slot))
+ scores.sort()
+
+ result = []
+ for score, field, slot in scores:
+ values = self._facetspy.get_values_as_dict(slot)
+ if len(values) <= 1:
+ continue
+ newvalues = []
+ if facettypes[field] == 'float':
+ # Convert numbers to python numbers, and number ranges to a
+ # python tuple of two numbers.
+ for value, frequency in values.iteritems():
+ if len(value) <= 9:
+ value1 = _xapian.sortable_unserialise(value)
+ value2 = value1
+ else:
+ value1 = _xapian.sortable_unserialise(value[:9])
+ value2 = _xapian.sortable_unserialise(value[9:])
+ newvalues.append(((value1, value2), frequency))
+ else:
+ for value, frequency in values.iteritems():
+ newvalues.append((value, frequency))
+
+ newvalues.sort()
+ result.append((field, newvalues))
+ if len(result) >= maxfacets:
+ break
+ return result
+
+
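A sketch of consuming the structure described above. `results` is assumed to be a SearchResults object obtained from a search() call with getfacets=True; the rest follows the tuple layout documented in get_suggested_facets():

    for field, values in results.get_suggested_facets(maxfacets=3):
        for value, frequency in values:
            if isinstance(value, tuple):
                # 'float' facet: value is an inclusive (start, end) range.
                print "%s: %g .. %g (%d matches)" % (field, value[0], value[1], frequency)
            else:
                # 'string' facet: value is the exact string stored at index time.
                print "%s: %s (%d matches)" % (field, value, frequency)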
class SearchConnection(object):
"""A connection to the search engine for searching.
The connection will access a view of the database.
"""
+ _qp_flags_std = (_xapian.QueryParser.FLAG_PHRASE |
+ _xapian.QueryParser.FLAG_BOOLEAN |
+ _xapian.QueryParser.FLAG_LOVEHATE |
+ _xapian.QueryParser.FLAG_AUTO_SYNONYMS |
+ _xapian.QueryParser.FLAG_AUTO_MULTIWORD_SYNONYMS)
+ _qp_flags_nobool = (_qp_flags_std | _xapian.QueryParser.FLAG_BOOLEAN) ^ _xapian.QueryParser.FLAG_BOOLEAN
def __init__(self, indexpath):
"""Create a new connection to the index for searching.
@@ -252,7 +355,10 @@ class SearchConnection(object):
"""Get the sort type that should be used for a given field.
"""
- actions = self._field_actions[field]._actions
+ try:
+ actions = self._field_actions[field]._actions
+ except KeyError:
+ actions = {}
for action, kwargslist in actions.iteritems():
if action == FieldActions.SORT_AND_COLLAPSE:
for kwargs in kwargslist:
@@ -266,6 +372,7 @@ class SearchConnection(object):
# class. Move it to a shared location.
config_file = _os.path.join(self._indexpath, 'config')
if not _os.path.exists(config_file):
+ self._field_actions = {}
self._field_mappings = _fieldmappings.FieldMappings()
return
fd = open(config_file)
@@ -368,21 +475,35 @@ class SearchConnection(object):
raise _errors.SearchError("SearchConnection has been closed")
return _xapian.Query(operator, list(queries))
- def query_filter(self, query, filter):
+ def query_filter(self, query, filter, exclude=False):
"""Filter a query with another query.
- Documents will only match the resulting query if they match both
- queries, but will be weighted according to only the first query.
+ If exclude is False (or not specified), documents will only match the
+ resulting query if they match both the first and the second query: the
+ results of the first query are "filtered" to only include those which
+ also match the second query.
+
+ If exclude is True, documents will only match the resulting query if
+ they match the first query, but not the second query: the results of
+ the first query are "filtered" to only include those which do not match
+ the second query.
+
+ Documents will always be weighted according to only the first query.
- `query`: The query to filter.
- `filter`: The filter to apply to the query.
+ - `exclude`: If True, the sense of the filter is reversed - only
+ documents which do not match the second query will be returned.
"""
if self._index is None:
raise _errors.SearchError("SearchConnection has been closed")
if not isinstance(filter, _xapian.Query):
raise _errors.SearchError("Filter must be a Xapian Query object")
- return _xapian.Query(_xapian.Query.OP_FILTER, query, filter)
+ if exclude:
+ return _xapian.Query(_xapian.Query.OP_AND_NOT, query, filter)
+ else:
+ return _xapian.Query(_xapian.Query.OP_FILTER, query, filter)
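A sketch of the new exclude flag, with an illustrative database path and field names (and assuming query_parse() takes the query string as its first argument, as it is used elsewhere in this module):

    from secore.searchconnection import SearchConnection

    sconn = SearchConnection('exampledb')
    main = sconn.query_parse('search engine')
    spam = sconn.query_field('category', 'spam')
    # Keep only documents that also match `spam`; weights come from `main` alone.
    both = sconn.query_filter(main, spam)
    # Keep only documents that do NOT match `spam`.
    no_spam = sconn.query_filter(main, spam, exclude=True)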
def query_range(self, field, begin, end):
"""Create a query for a range search.
@@ -407,9 +528,61 @@ class SearchConnection(object):
begin = fn(field, begin)
end = fn(field, end)
- slot = self._field_mappings.get_slot(field)
+ try:
+ slot = self._field_mappings.get_slot(field)
+ except KeyError:
+ return _xapian.Query()
return _xapian.Query(_xapian.Query.OP_VALUE_RANGE, slot, begin, end)
+ def query_facet(self, field, val):
+ """Create a query for a facet value.
+
+ This creates a query which matches only those documents which have a
+ facet value in the specified range.
+
+ For a numeric range facet, val should be a tuple holding the start and
+ end of the range. For other facets, val should be the value to look
+ for.
+
+ The start and end values are both inclusive - any documents with a
+ value equal to start or end will be returned (unless end is less than
+ start, in which case no documents will be returned).
+
+ """
+ if self._index is None:
+ raise _errors.SearchError("SearchConnection has been closed")
+
+ try:
+ actions = self._field_actions[field]._actions
+ except KeyError:
+ actions = {}
+ facettype = None
+ for action, kwargslist in actions.iteritems():
+ if action == FieldActions.FACET:
+ for kwargs in kwargslist:
+ facettype = kwargs.get('type', None)
+ if facettype is not None:
+ break
+ if facettype is not None:
+ break
+
+ if facettype == 'float':
+ assert(len(val) == 2)
+ try:
+ slot = self._field_mappings.get_slot(field)
+ except KeyError:
+ return _xapian.Query()
+ marshaller = SortableMarshaller(False)
+ fn = marshaller.get_marshall_function(field, facettype)
+ begin = fn(field, val[0])
+ end = fn(field, val[1])
+ return _xapian.Query(_xapian.Query.OP_VALUE_RANGE, slot, begin, end)
+ else:
+ assert(facettype == 'string' or facettype is None)
+ prefix = self._field_mappings.get_prefix(field)
+ return _xapian.Query(prefix + val.lower())
+
+
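A sketch of query_facet() for the two facet types. The field names are illustrative and must have been configured with the FACET action ('price' with type='float', 'category' with type='string'):

    from secore.searchconnection import SearchConnection

    sconn = SearchConnection('exampledb')
    # String facet: the value is lowercased and matched as a single exact term.
    in_category = sconn.query_facet('category', 'laptops')
    # Float facet: val is an inclusive (start, end) range.
    in_price_range = sconn.query_facet('price', (100.0, 250.0))
    query = sconn.query_filter(sconn.query_parse('xo'), in_price_range)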
def _prepare_queryparser(self, allow, deny, default_op):
"""Prepare (and return) a query parser using the specified fields and
operator.
@@ -429,7 +602,10 @@ class SearchConnection(object):
allow = [key for key in allow if key not in deny]
for field in allow:
- actions = self._field_actions[field]._actions
+ try:
+ actions = self._field_actions[field]._actions
+ except KeyError:
+ actions = {}
for action, kwargslist in actions.iteritems():
if action == FieldActions.INDEX_EXACT:
# FIXME - need patched version of xapian to add exact prefixes
@@ -459,8 +635,11 @@ class SearchConnection(object):
Only one of `allow` and `deny` may be specified.
- If any of the entries in `allow` or `deny` are not present in the
- configuration for the database, an exception will be raised.
+ If any of the entries in `allow` are not present in the configuration
+ for the database, or are not specified for indexing (either as
+ INDEX_EXACT or INDEX_FREETEXT), they will be ignored. If any of the
+ entries in `deny` are not present in the configuration for the
+ database, they will be ignored.
Returns a Query object, which may be passed to the search() method, or
combined with other queries.
@@ -468,11 +647,11 @@ class SearchConnection(object):
"""
qp = self._prepare_queryparser(allow, deny, default_op)
try:
- return qp.parse_query(string)
+ return qp.parse_query(string, self._qp_flags_std)
except _xapian.QueryParserError, e:
# If we got a parse error, retry without boolean operators (since
# these are the usual cause of the parse error).
- return qp.parse_query(string, 0)
+ return qp.parse_query(string, self._qp_flags_nobool)
def query_field(self, field, value, default_op=OP_AND):
"""A query for a single field.
@@ -487,7 +666,9 @@ class SearchConnection(object):
# need to check on field type, and stem / split as appropriate
for action, kwargslist in actions.iteritems():
- if action == FieldActions.INDEX_EXACT:
+ if action in (FieldActions.INDEX_EXACT,
+ FieldActions.TAG,
+ FieldActions.FACET,):
prefix = self._field_mappings.get_prefix(field)
if len(value) > 0:
chval = ord(value[0])
@@ -505,9 +686,7 @@ class SearchConnection(object):
qp.set_stemming_strategy(qp.STEM_SOME)
except KeyError:
pass
- return qp.parse_query(value,
- qp.FLAG_PHRASE | qp.FLAG_BOOLEAN | qp.FLAG_LOVEHATE,
- prefix)
+ return qp.parse_query(value, self._qp_flags_std, prefix)
return _xapian.Query()
@@ -528,12 +707,15 @@ class SearchConnection(object):
Only one of `allow` and `deny` may be specified.
- If any of the entries in `allow` or `deny` are not present in the
- configuration for the database, an exception will be raised.
+ If any of the entries in `allow` are not present in the configuration
+ for the database, or are not specified for indexing (either as
+ INDEX_EXACT or INDEX_FREETEXT), they will be ignored. If any of the
+ entries in `deny` are not present in the configuration for the
+ database, they will be ignored.
"""
qp = self._prepare_queryparser(allow, deny, self.OP_AND)
- qp.parse_query(string, qp.FLAG_PHRASE|qp.FLAG_BOOLEAN|qp.FLAG_LOVEHATE|qp.FLAG_SPELLING_CORRECTION)
+ qp.parse_query(string, self._qp_flags_std | qp.FLAG_SPELLING_CORRECTION)
corrected = qp.get_corrected_query_string()
if len(corrected) == 0:
if isinstance(string, unicode):
@@ -544,7 +726,9 @@ class SearchConnection(object):
return corrected
def search(self, query, startrank, endrank,
- checkatleast=0, sortby=None, collapse=None):
+ checkatleast=0, sortby=None, collapse=None,
+ gettags=None,
+ getfacets=None, allowfacets=None, denyfacets=None):
"""Perform a search, for documents matching a query.
- `query` is the query to perform.
@@ -556,7 +740,10 @@ class SearchConnection(object):
be returned.
- `checkatleast` is the minimum number of results to check for: the
estimate of the total number of matches will always be exact if
- the number of matches is less than `checkatleast`.
+ the number of matches is less than `checkatleast`. A value of ``-1``
+ can be specified for the checkatleast parameter - this has the
+ special meaning of "check all matches", and is equivalent to passing
+ the result of get_doccount().
- `sortby` is the name of a field to sort by. It may be preceded by a
'+' or a '-' to indicate ascending or descending order
(respectively). If the first character is neither '+' or '-', the
@@ -564,10 +751,23 @@ class SearchConnection(object):
- `collapse` is the name of a field to collapse the result documents
on. If this is specified, there will be at most one result in the
result set for each value of the field.
+ - `gettags` is the name of a field to count tag occurrences in, or a
+ list of fields to do so.
+ - `getfacets` is a boolean - if True, the matching documents will be
+ examined to build up a list of the facet values contained in them.
+ - `allowfacets` is a list of the fieldnames of facets to consider.
+ - `denyfacets` is a list of fieldnames of facets which will not be
+ considered.
+
+ If neither 'allowfacets' nor 'denyfacets' is specified, all fields
+ holding facets will be considered.
"""
if self._index is None:
raise _errors.SearchError("SearchConnection has been closed")
+ if checkatleast == -1:
+ checkatleast = self._index.get_doccount()
+
enq = _xapian.Enquire(self._index)
enq.set_query(query)
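A sketch of the new search() parameters, tying the pieces together (database path and field names are illustrative; 'tags' is assumed to carry the TAG action and the facet fields the FACET action):

    from secore.searchconnection import SearchConnection

    sconn = SearchConnection('exampledb')
    query = sconn.query_parse('laptop')
    # checkatleast=-1 means "check all matches", so counts and facets are exact.
    results = sconn.search(query, 0, 10, checkatleast=-1,
                           gettags='tags', getfacets=True)
    print results.get_top_tags('tags', 5)
    print results.get_suggested_facets(maxfacets=3)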
@@ -602,16 +802,103 @@ class SearchConnection(object):
# there are more matches.
checkatleast = max(checkatleast, endrank + 1)
+ # Build the matchspy.
+ matchspies = []
+
+ # First, add a matchspy for any gettags fields
+ if isinstance(gettags, basestring):
+ if len(gettags) != 0:
+ gettags = [gettags]
+ tagspy = None
+ if gettags is not None and len(gettags) != 0:
+ tagspy = _xapian.TermCountMatchSpy()
+ for field in gettags:
+ try:
+ prefix = self._field_mappings.get_prefix(field)
+ tagspy.add_prefix(prefix)
+ except KeyError:
+ raise _errors.SearchError("Field %r was not indexed for tagging" % field)
+ matchspies.append(tagspy)
+
+
+ # add a matchspy for facet selection here.
+ facetspy = None
+ facetfields = []
+ if getfacets:
+ if allowfacets is not None and denyfacets is not None:
+ raise _errors.SearchError("Cannot specify both `allowfacets` and `denyfacets`")
+ if allowfacets is None:
+ allowfacets = [key for key in self._field_actions]
+ if denyfacets is not None:
+ allowfacets = [key for key in allowfacets if key not in denyfacets]
+
+ for field in allowfacets:
+ try:
+ actions = self._field_actions[field]._actions
+ except KeyError:
+ actions = {}
+ for action, kwargslist in actions.iteritems():
+ if action == FieldActions.FACET:
+ slot = self._field_mappings.get_slot(field)
+ if facetspy is None:
+ facetspy = _xapian.CategorySelectMatchSpy()
+ facetspy.add_slot(slot)
+ facetfields.append((field, slot,
+ kwargslist))
+ if facetspy is not None:
+ matchspies.append(facetspy)
+
+
+ # Finally, build a single matchspy to pass to get_mset().
+ if len(matchspies) == 0:
+ matchspy = None
+ elif len(matchspies) == 1:
+ matchspy = matchspies[0]
+ else:
+ matchspy = _xapian.MultipleMatchDecider()
+ for spy in matchspies:
+ matchspy.append(spy)
+
enq.set_docid_order(enq.DONT_CARE)
# Repeat the search until we don't get a DatabaseModifiedError
while True:
try:
- mset = enq.get_mset(startrank, maxitems, checkatleast)
+ mset = enq.get_mset(startrank, maxitems, checkatleast, None,
+ None, matchspy)
break
except _xapian.DatabaseModifiedError, e:
self.reopen()
- return SearchResults(self, enq, query, mset, self._field_mappings)
+ return SearchResults(self, enq, query, mset, self._field_mappings,
+ tagspy, facetspy, facetfields)
+
+ def iter_synonyms(self, prefix=""):
+ """Get an iterator over the synonyms.
+
+ - `prefix`: if specified, only synonym keys with this prefix will be
+ returned.
+
+ The iterator returns 2-tuples, in which the first item is the key (ie,
+ a 2-tuple holding the term or terms which will be synonym expanded,
+ followed by the fieldname specified (or None if no fieldname)), and the
+ second item is a tuple of strings holding the synonyms for the first
+ item.
+
+ These return values are suitable for the dict() builtin, so you can
+ write things like:
+
+ >>> conn = _indexerconnection.IndexerConnection('foo')
+ >>> conn.add_synonym('foo', 'bar')
+ >>> conn.add_synonym('foo bar', 'baz')
+ >>> conn.add_synonym('foo bar', 'foo baz')
+ >>> conn.flush()
+ >>> conn = SearchConnection('foo')
+ >>> dict(conn.iter_synonyms())
+ {('foo', None): ('bar',), ('foo bar', None): ('baz', 'foo baz')}
+
+ """
+ return _indexerconnection.SynonymIter(self._index, self._field_mappings, prefix)
+
if __name__ == '__main__':
import doctest, sys