diff options
Diffstat (limited to 'secore/searchconnection.py')
-rw-r--r-- | secore/searchconnection.py | 343 |
1 files changed, 28 insertions, 315 deletions
diff --git a/secore/searchconnection.py b/secore/searchconnection.py index f7caeab..79fa509 100644 --- a/secore/searchconnection.py +++ b/secore/searchconnection.py @@ -20,16 +20,14 @@ r"""searchconnection.py: A connection to the search engine for searching. """ __docformat__ = "restructuredtext en" -import os as _os -import cPickle as _cPickle - import xapian as _xapian from datastructures import * from fieldactions import * import fieldmappings as _fieldmappings import highlight as _highlight import errors as _errors -import indexerconnection as _indexerconnection +import os as _os +import cPickle as _cPickle class SearchResult(ProcessedDocument): """A result from a search. @@ -44,10 +42,7 @@ class SearchResult(ProcessedDocument): """Get the language that should be used for a given field. """ - try: - actions = self._results._conn._field_actions[field]._actions - except KeyError: - actions = {} + actions = self._results._conn._field_actions[field]._actions for action, kwargslist in actions.iteritems(): if action == FieldActions.INDEX_FREETEXT: for kwargs in kwargslist: @@ -123,24 +118,20 @@ class SearchResultIter(object): def next(self): msetitem = self._iter.next() - return SearchResult(msetitem, self._results) + return SearchResult(msetitem, + self._results) class SearchResults(object): """A set of results of a search. """ - def __init__(self, conn, enq, query, mset, fieldmappings, tagspy, - facetspy, facetfields): + def __init__(self, conn, enq, query, mset, fieldmappings): self._conn = conn self._enq = enq self._query = query self._mset = mset self._fieldmappings = fieldmappings - self._tagspy = tagspy - self._facetspy = facetspy - self._facetfields = facetfields - self._numeric_ranges_built = {} def __repr__(self): return ("<SearchResults(startrank=%d, " @@ -234,106 +225,12 @@ class SearchResults(object): """ return SearchResultIter(self) - def get_top_tags(self, field, maxtags): - """Get the most frequent tags in a given field. - - - `field` - the field to get tags for. This must have been specified - in the "gettags" argument of the search() call. - - `maxtags` - the maximum number of tags to return. - - Returns a sequence of 2-item tuples, in which the first item in the - tuple is the tag, and the second is the frequency of the tag in the - matches seen (as an integer). - - """ - if self._tagspy is None: - raise _errors.SearchError("Field %r was not specified for getting tags" % field) - try: - prefix = self._conn._field_mappings.get_prefix(field) - except KeyError: - raise _errors.SearchError("Field %r was not indexed for tagging" % field) - return self._tagspy.get_top_terms(prefix, maxtags) - - def get_suggested_facets(self, maxfacets=5, desired_num_of_categories=7): - """Get a suggested set of facets, to present to the user. - - This returns a list, in descending order of the usefulness of the - facet, in which each item is a tuple holding: - - - fieldname of facet. - - sequence of 2-tuples holding the suggested values or ranges for that - field: - - For facets of type 'string', the first item in the 2-tuple will - simply be the string supplied when the facet value was added to its - document. For facets of type 'float', it will be a 2-tuple, holding - floats giving the start and end of the suggested value range. - - The second item in the 2-tuple will be the frequency of the facet - value or range in the result set. - - """ - if self._facetspy is None: - return [] - scores = [] - facettypes = {} - for field, slot, kwargslist in self._facetfields: - type = None - for kwargs in kwargslist: - type = kwargs.get('type', None) - if type is not None: break - if type is None: type = 'string' - - if type == 'float': - if field not in self._numeric_ranges_built: - field, self._facetspy.build_numeric_ranges(slot, desired_num_of_categories) - self._numeric_ranges_built[field] = None - facettypes[field] = type - score = self._facetspy.score_categorisation(slot, - desired_num_of_categories) - scores.append((score, field, slot)) - scores.sort() - - result = [] - for score, field, slot in scores: - values = self._facetspy.get_values_as_dict(slot) - if len(values) <= 1: - continue - newvalues = [] - if facettypes[field] == 'float': - # Convert numbers to python numbers, and number ranges to a - # python tuple of two numbers. - for value, frequency in values.iteritems(): - if len(value) <= 9: - value1 = _xapian.sortable_unserialise(value) - value2 = value1 - else: - value1 = _xapian.sortable_unserialise(value[:9]) - value2 = _xapian.sortable_unserialise(value[9:]) - newvalues.append(((value1, value2), frequency)) - else: - for value, frequency in values.iteritems(): - newvalues.append((value, frequency)) - - newvalues.sort() - result.append((field, newvalues)) - if len(result) >= maxfacets: - break - return result - - class SearchConnection(object): """A connection to the search engine for searching. The connection will access a view of the database. """ - _qp_flags_std = (_xapian.QueryParser.FLAG_PHRASE | - _xapian.QueryParser.FLAG_BOOLEAN | - _xapian.QueryParser.FLAG_LOVEHATE | - _xapian.QueryParser.FLAG_AUTO_SYNONYMS | - _xapian.QueryParser.FLAG_AUTO_MULTIWORD_SYNONYMS) - _qp_flags_nobool = (_qp_flags_std | _xapian.QueryParser.FLAG_BOOLEAN) ^ _xapian.QueryParser.FLAG_BOOLEAN def __init__(self, indexpath): """Create a new connection to the index for searching. @@ -355,10 +252,7 @@ class SearchConnection(object): """Get the sort type that should be used for a given field. """ - try: - actions = self._field_actions[field]._actions - except KeyError: - actions = {} + actions = self._field_actions[field]._actions for action, kwargslist in actions.iteritems(): if action == FieldActions.SORT_AND_COLLAPSE: for kwargs in kwargslist: @@ -372,7 +266,6 @@ class SearchConnection(object): # class. Move it to a shared location. config_file = _os.path.join(self._indexpath, 'config') if not _os.path.exists(config_file): - self._field_actions = {} self._field_mappings = _fieldmappings.FieldMappings() return fd = open(config_file) @@ -475,35 +368,21 @@ class SearchConnection(object): raise _errors.SearchError("SearchConnection has been closed") return _xapian.Query(operator, list(queries)) - def query_filter(self, query, filter, exclude=False): + def query_filter(self, query, filter): """Filter a query with another query. - If exclude is False (or not specified), documents will only match the - resulting query if they match the both the first and second query: the - results of the first query are "filtered" to only include those which - also match the second query. - - If exclude is True, documents will only match the resulting query if - they match the first query, but not the second query: the results of - the first query are "filtered" to only include those which do not match - the second query. - - Documents will always be weighted according to only the first query. + Documents will only match the resulting query if they match both + queries, but will be weighted according to only the first query. - `query`: The query to filter. - `filter`: The filter to apply to the query. - - `exclude`: If True, the sense of the filter is reversed - only - documents which do not match the second query will be returned. """ if self._index is None: raise _errors.SearchError("SearchConnection has been closed") if not isinstance(filter, _xapian.Query): raise _errors.SearchError("Filter must be a Xapian Query object") - if exclude: - return _xapian.Query(_xapian.Query.OP_AND_NOT, query, filter) - else: - return _xapian.Query(_xapian.Query.OP_FILTER, query, filter) + return _xapian.Query(_xapian.Query.OP_FILTER, query, filter) def query_range(self, field, begin, end): """Create a query for a range search. @@ -528,61 +407,9 @@ class SearchConnection(object): begin = fn(field, begin) end = fn(field, end) - try: - slot = self._field_mappings.get_slot(field) - except KeyError: - return _xapian.Query() + slot = self._field_mappings.get_slot(field) return _xapian.Query(_xapian.Query.OP_VALUE_RANGE, slot, begin, end) - def query_facet(self, field, val): - """Create a query for a facet value. - - This creates a query which matches only those documents which have a - facet value in the specified range. - - For a numeric range facet, val should be a tuple holding the start and - end of the range. For other facets, val should be the value to look - for. - - The start and end values are both inclusive - any documents with a - value equal to start or end will be returned (unless end is less than - start, in which case no documents will be returned). - - """ - if self._index is None: - raise _errors.SearchError("SearchConnection has been closed") - - try: - actions = self._field_actions[field]._actions - except KeyError: - actions = {} - facettype = None - for action, kwargslist in actions.iteritems(): - if action == FieldActions.FACET: - for kwargs in kwargslist: - facettype = kwargs.get('type', None) - if facettype is not None: - break - if facettype is not None: - break - - if facettype == 'float': - assert(len(val) == 2) - try: - slot = self._field_mappings.get_slot(field) - except KeyError: - return _xapian.Query() - marshaller = SortableMarshaller(False) - fn = marshaller.get_marshall_function(field, sorttype) - begin = fn(field, val[0]) - end = fn(field, val[1]) - return _xapian.Query(_xapian.Query.OP_VALUE_RANGE, slot, begin, end) - else: - assert(facettype == 'string' or facettype is None) - prefix = self._field_mappings.get_prefix(field) - return _xapian.Query(prefix + val.lower()) - - def _prepare_queryparser(self, allow, deny, default_op): """Prepare (and return) a query parser using the specified fields and operator. @@ -602,10 +429,7 @@ class SearchConnection(object): allow = [key for key in allow if key not in deny] for field in allow: - try: - actions = self._field_actions[field]._actions - except KeyError: - actions = {} + actions = self._field_actions[field]._actions for action, kwargslist in actions.iteritems(): if action == FieldActions.INDEX_EXACT: # FIXME - need patched version of xapian to add exact prefixes @@ -635,11 +459,8 @@ class SearchConnection(object): Only one of `allow` and `deny` may be specified. - If any of the entries in `allow` are not present in the configuration - for the database, or are not specified for indexing (either as - INDEX_EXACT or INDEX_FREETEXT), they will be ignored. If any of the - entries in `deny` are not present in the configuration for the - database, they will be ignored. + If any of the entries in `allow` or `deny` are not present in the + configuration for the database, an exception will be raised. Returns a Query object, which may be passed to the search() method, or combined with other queries. @@ -647,11 +468,11 @@ class SearchConnection(object): """ qp = self._prepare_queryparser(allow, deny, default_op) try: - return qp.parse_query(string, self._qp_flags_std) + return qp.parse_query(string) except _xapian.QueryParserError, e: # If we got a parse error, retry without boolean operators (since # these are the usual cause of the parse error). - return qp.parse_query(string, self._qp_flags_nobool) + return qp.parse_query(string, 0) def query_field(self, field, value, default_op=OP_AND): """A query for a single field. @@ -666,9 +487,7 @@ class SearchConnection(object): # need to check on field type, and stem / split as appropriate for action, kwargslist in actions.iteritems(): - if action in (FieldActions.INDEX_EXACT, - FieldActions.TAG, - FieldActions.FACET,): + if action == FieldActions.INDEX_EXACT: prefix = self._field_mappings.get_prefix(field) if len(value) > 0: chval = ord(value[0]) @@ -686,7 +505,9 @@ class SearchConnection(object): qp.set_stemming_strategy(qp.STEM_SOME) except KeyError: pass - return qp.parse_query(value, self._qp_flags_std, prefix) + return qp.parse_query(value, + qp.FLAG_PHRASE | qp.FLAG_BOOLEAN | qp.FLAG_LOVEHATE, + prefix) return _xapian.Query() @@ -707,15 +528,12 @@ class SearchConnection(object): Only one of `allow` and `deny` may be specified. - If any of the entries in `allow` are not present in the configuration - for the database, or are not specified for indexing (either as - INDEX_EXACT or INDEX_FREETEXT), they will be ignored. If any of the - entries in `deny` are not present in the configuration for the - database, they will be ignored. + If any of the entries in `allow` or `deny` are not present in the + configuration for the database, an exception will be raised. """ qp = self._prepare_queryparser(allow, deny, self.OP_AND) - qp.parse_query(string, self._qp_flags_std | qp.FLAG_SPELLING_CORRECTION) + qp.parse_query(string, qp.FLAG_PHRASE|qp.FLAG_BOOLEAN|qp.FLAG_LOVEHATE|qp.FLAG_SPELLING_CORRECTION) corrected = qp.get_corrected_query_string() if len(corrected) == 0: if isinstance(string, unicode): @@ -726,9 +544,7 @@ class SearchConnection(object): return corrected def search(self, query, startrank, endrank, - checkatleast=0, sortby=None, collapse=None, - gettags=None, - getfacets=None, allowfacets=None, denyfacets=None): + checkatleast=0, sortby=None, collapse=None): """Perform a search, for documents matching a query. - `query` is the query to perform. @@ -740,10 +556,7 @@ class SearchConnection(object): be returned. - `checkatleast` is the minimum number of results to check for: the estimate of the total number of matches will always be exact if - the number of matches is less than `checkatleast`. A value of ``-1`` - can be specified for the checkatleast parameter - this has the - special meaning of "check all matches", and is equivalent to passing - the result of get_doccount(). + the number of matches is less than `checkatleast`. - `sortby` is the name of a field to sort by. It may be preceded by a '+' or a '-' to indicate ascending or descending order (respectively). If the first character is neither '+' or '-', the @@ -751,23 +564,10 @@ class SearchConnection(object): - `collapse` is the name of a field to collapse the result documents on. If this is specified, there will be at most one result in the result set for each value of the field. - - `gettags` is the name of a field to count tag occurrences in, or a - list of fields to do so. - - `getfacets` is a boolean - if True, the matching documents will be - examined to build up a list of the facet values contained in them. - - `allowfacets` is a list of the fieldnames of facets to consider. - - `denyfacets` is a list of fieldnames of facets which will not be - considered. - - If neither 'allowfacets' or 'denyfacets' is specified, all fields - holding facets will be considered. """ if self._index is None: raise _errors.SearchError("SearchConnection has been closed") - if checkatleast == -1: - checkatleast = self._index.get_doccount() - enq = _xapian.Enquire(self._index) enq.set_query(query) @@ -802,103 +602,16 @@ class SearchConnection(object): # there are more matches. checkatleast = max(checkatleast, endrank + 1) - # Build the matchspy. - matchspies = [] - - # First, add a matchspy for any gettags fields - if isinstance(gettags, basestring): - if len(gettags) != 0: - gettags = [gettags] - tagspy = None - if gettags is not None and len(gettags) != 0: - tagspy = _xapian.TermCountMatchSpy() - for field in gettags: - try: - prefix = self._field_mappings.get_prefix(field) - tagspy.add_prefix(prefix) - except KeyError: - raise _errors.SearchError("Field %r was not indexed for tagging" % field) - matchspies.append(tagspy) - - - # add a matchspy for facet selection here. - facetspy = None - facetfields = [] - if getfacets: - if allowfacets is not None and denyfacets is not None: - raise _errors.SearchError("Cannot specify both `allowfacets` and `denyfacets`") - if allowfacets is None: - allowfacets = [key for key in self._field_actions] - if denyfacets is not None: - allowfacets = [key for key in allowfacets if key not in denyfacets] - - for field in allowfacets: - try: - actions = self._field_actions[field]._actions - except KeyError: - actions = {} - for action, kwargslist in actions.iteritems(): - if action == FieldActions.FACET: - slot = self._field_mappings.get_slot(field) - if facetspy is None: - facetspy = _xapian.CategorySelectMatchSpy() - facetspy.add_slot(slot) - facetfields.append((field, slot, - kwargslist)) - if facetspy is not None: - matchspies.append(facetspy) - - - # Finally, build a single matchspy to pass to get_mset(). - if len(matchspies) == 0: - matchspy = None - elif len(matchspies) == 1: - matchspy = matchspies[0] - else: - matchspy = _xapian.MultipleMatchDecider() - for spy in matchspies: - matchspy.append(spy) - enq.set_docid_order(enq.DONT_CARE) # Repeat the search until we don't get a DatabaseModifiedError while True: try: - mset = enq.get_mset(startrank, maxitems, checkatleast, None, - None, matchspy) + mset = enq.get_mset(startrank, maxitems, checkatleast) break except _xapian.DatabaseModifiedError, e: self.reopen() - return SearchResults(self, enq, query, mset, self._field_mappings, - tagspy, facetspy, facetfields) - - def iter_synonyms(self, prefix=""): - """Get an iterator over the synonyms. - - - `prefix`: if specified, only synonym keys with this prefix will be - returned. - - The iterator returns 2-tuples, in which the first item is the key (ie, - a 2-tuple holding the term or terms which will be synonym expanded, - followed by the fieldname specified (or None if no fieldname)), and the - second item is a tuple of strings holding the synonyms for the first - item. - - These return values are suitable for the dict() builtin, so you can - write things like: - - >>> conn = _indexerconnection.IndexerConnection('foo') - >>> conn.add_synonym('foo', 'bar') - >>> conn.add_synonym('foo bar', 'baz') - >>> conn.add_synonym('foo bar', 'foo baz') - >>> conn.flush() - >>> conn = SearchConnection('foo') - >>> dict(conn.iter_synonyms()) - {('foo', None): ('bar',), ('foo bar', None): ('baz', 'foo baz')} - - """ - return _indexerconnection.SynonymIter(self._index, self._field_mappings, prefix) - + return SearchResults(self, enq, query, mset, self._field_mappings) if __name__ == '__main__': import doctest, sys |