diff options
author | Sayamindu Dasgupta <sayamindu@gmail.com> | 2010-01-09 09:09:32 (GMT) |
---|---|---|
committer | Sayamindu Dasgupta <sayamindu@gmail.com> | 2010-01-09 09:09:32 (GMT) |
commit | 72c1991510699e6541446d9f8e139fe54b392c89 (patch) | |
tree | c900e1f346a3c98569ab7bdebb3cc073b5e26581 /translate-toolkit-1.5.1/translate/search/match.py | |
parent | b7dbad4e48e8c6779e05a56cae5a83b3c3bfec40 (diff) |
Added translate toolkit 1.5.1
Diffstat (limited to 'translate-toolkit-1.5.1/translate/search/match.py')
-rw-r--r-- | translate-toolkit-1.5.1/translate/search/match.py | 311 |
1 files changed, 311 insertions, 0 deletions
diff --git a/translate-toolkit-1.5.1/translate/search/match.py b/translate-toolkit-1.5.1/translate/search/match.py new file mode 100644 index 0000000..750fbc6 --- /dev/null +++ b/translate-toolkit-1.5.1/translate/search/match.py @@ -0,0 +1,311 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright 2006-2009 Zuza Software Foundation +# +# This file is part of the Translate Toolkit. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, see <http://www.gnu.org/licenses/>. 
"""Class to perform translation memory matching from a store of translation units"""

import heapq
import re

from translate.search import lshtein
from translate.search import terminology
from translate.storage import base
from translate.storage import po
from translate.misc.multistring import multistring


def sourcelen(unit):
    """Returns the length of the source string of the given unit."""
    return len(unit.source)


class matcher(object):
    """A class that will do matching and store configuration for the matching process"""

    # Direction in which the candidate list is sorted by source length.
    # matches() below relies on ascending order (False) for its binary search.
    sort_reverse = False

    def __init__(self, store, max_candidates=10, min_similarity=75, max_length=70, comparer=None, usefuzzy=False):
        """Sets up the matcher and loads the translation memory from store.

        @param store: A store, or a list of stores, whose units populate the TM.
        @param max_candidates: The maximum number of candidates that should
        be assembled.
        @param min_similarity: The minimum similarity that must be attained
        to be included in the result.
        @param max_length: Upper bound on the source length considered; also
        passed to the default comparer.
        @param comparer: An optional Comparer with a similarity() function.
        Defaults to a LevenshteinComparer.
        @param usefuzzy: Whether fuzzy units may be used as candidates.
        """
        if comparer is None:
            comparer = lshtein.LevenshteinComparer(max_length)
        self.comparer = comparer
        self.setparameters(max_candidates, min_similarity, max_length)
        self.usefuzzy = usefuzzy
        self.inittm(store)
        self.addpercentage = True

    def usable(self, unit):
        """Returns whether this translation unit is usable for TM.

        Note that this has a side effect: a unit that is accepted is recorded
        in self.existingunits, so that an identical source/target pair is
        rejected when it is seen again.
        """
        #TODO: We might want to consider more attributes, such as approved, reviewed, etc.
        source = unit.source
        target = unit.target
        if not (source and target and (self.usefuzzy or not unit.isfuzzy())):
            return False
        if len(source) < 2:
            return False
        if source in self.existingunits and self.existingunits[source] == target:
            # Exact duplicate of something we already have
            return False
        self.existingunits[source] = target
        return True

    def inittm(self, stores, reverse=False):
        """Initialises the memory for later use. We use simple base units for
        speedup.

        @param stores: A store or a list of stores to load units from.
        @param reverse: Deprecated and ignored - just use self.sort_reverse.
        """
        self.existingunits = {}
        self.candidates = base.TranslationStore()

        if not isinstance(stores, list):
            stores = [stores]
        for store in stores:
            self.extendtm(store.units, store=store, sort=False)
        # Sort once at the end instead of after each store
        self.candidates.units.sort(key=sourcelen, reverse=self.sort_reverse)

    def extendtm(self, units, store=None, sort=True):
        """Extends the memory with extra unit(s).

        @param units: The units to add to the TM.
        @param store: Optional store from where some metadata can be retrieved
        and associated with each unit.
        @param sort: Optional parameter that can be set to False to suppress
        sorting of the candidates list. This should probably only be used in
        inittm().
        """
        if not isinstance(units, list):
            units = [units]
        candidates = [unit for unit in units if self.usable(unit)]
        for candidate in candidates:
            simpleunit = base.TranslationUnit("")
            # We need to ensure that we don't pass multistrings further, since
            # some modules (like the native Levenshtein) can't use it.
            if isinstance(candidate.source, multistring):
                if len(candidate.source.strings) > 1:
                    # Remember the plural forms so that buildunits() can
                    # hand them back in the results
                    simpleunit.orig_source = candidate.source
                    simpleunit.orig_target = candidate.target
                simpleunit.source = unicode(candidate.source)
                simpleunit.target = unicode(candidate.target)
            else:
                simpleunit.source = candidate.source
                simpleunit.target = candidate.target
            # If we now only get translator comments, we don't get programmer
            # comments in TM suggestions (in Pootle, for example). If we get all
            # notes, pot2po adds all previous comments as translator comments
            # in the new po file
            simpleunit.addnote(candidate.getnotes(origin="translator"))
            simpleunit.fuzzy = candidate.isfuzzy()
            self.candidates.units.append(simpleunit)
        if sort:
            self.candidates.units.sort(key=sourcelen, reverse=self.sort_reverse)

    def setparameters(self, max_candidates=10, min_similarity=75, max_length=70):
        """Sets the parameters without reinitialising the tm. If a parameter
        is not specified, it is set to the default, not ignored"""
        self.MAX_CANDIDATES = max_candidates
        self.MIN_SIMILARITY = min_similarity
        self.MAX_LENGTH = max_length

    def getstoplength(self, min_similarity, text):
        """Calculates a length beyond which we are not interested.
        The extra fat is because we don't use plain character distance only."""
        if min_similarity <= 0:
            # No similarity requirement means no length can be ruled out by
            # it (and we must not divide by zero): only MAX_LENGTH applies.
            return self.MAX_LENGTH
        return min(len(text) / (min_similarity/100.0), self.MAX_LENGTH)

    def getstartlength(self, min_similarity, text):
        """Calculates the minimum length we are interested in.
        The extra fat is because we don't use plain character distance only."""
        return max(len(text) * (min_similarity/100.0), 1)

    def matches(self, text):
        """Returns a list of possible matches for given source text.

        @type text: String
        @param text: The text that will be search for in the translation memory
        @rtype: list
        @return: a list of units with the source and target strings from the
        translation memory, best match first. If self.addpercentage is true
        (default) the match quality is given as a percentage in the notes.
        """
        if self.MAX_CANDIDATES <= 0:
            # Nothing may be returned, and heapreplace() can't work on an
            # empty heap.
            return []

        # Heap entries are (score, sequence number, unit). The sequence number
        # makes every real entry unique, so that equal scores never cause the
        # units themselves to be compared.
        bestcandidates = [(0.0, 0, None)]*self.MAX_CANDIDATES
        #We use self.MIN_SIMILARITY, but if we already know we have max_candidates
        #that are better, we can adjust min_similarity upwards for speedup
        min_similarity = self.MIN_SIMILARITY

        # We want to limit our search in self.candidates, so we want to ignore
        # all units with a source string that is too short or too long. We use
        # a binary search to find the shortest string, from where we start our
        # search in the candidates.
        units = self.candidates.units

        # minimum source string length to be considered
        startlength = self.getstartlength(min_similarity, text)
        startindex = 0
        endindex = len(units)
        while startindex < endindex:
            mid = (startindex + endindex) // 2
            if sourcelen(units[mid]) < startlength:
                startindex = mid + 1
            else:
                endindex = mid

        # maximum source string length to be considered
        stoplength = self.getstoplength(min_similarity, text)
        lowestscore = 0
        sequence = 0

        for candidate in units[startindex:]:
            cmpstring = candidate.source
            if len(cmpstring) > stoplength:
                # Candidates are sorted by length: all the rest are too long
                break
            similarity = self.comparer.similarity(text, cmpstring, min_similarity)
            if similarity < min_similarity:
                continue
            if similarity > lowestscore:
                sequence += 1
                heapq.heapreplace(bestcandidates, (similarity, sequence, candidate))
                lowestscore = bestcandidates[0][0]
                if lowestscore >= 100:
                    # Even the worst one we hold is perfect: can't improve
                    break
                if min_similarity < lowestscore:
                    min_similarity = lowestscore
                    stoplength = self.getstoplength(min_similarity, text)

        #Remove the empty ones:
        found = [entry for entry in bestcandidates if entry[0] != 0]
        #Sort for use as a general list, with the best one at index 0. Equal
        #scores keep the order in which they were found.
        found.sort(key=lambda entry: (-entry[0], entry[1]))
        return self.buildunits([(entry[0], entry[2]) for entry in found])

    def buildunits(self, candidates):
        """Builds a list of units conforming to base API, with the score in the comment

        @param candidates: A list of (score, unit) tuples, where the units
        come from self.candidates.
        @rtype: list
        @return: A new po unit for each candidate. The candidates themselves
        are left untouched.
        """
        units = []
        for score, candidate in candidates:
            # Hand back the original plural forms if extendtm() stored them,
            # but don't write them into the candidate: the TM must keep its
            # plain strings for the comparer.
            if hasattr(candidate, "orig_source"):
                source = candidate.orig_source
                target = candidate.orig_target
            else:
                source = candidate.source
                target = candidate.target
            newunit = po.pounit(source)
            newunit.target = target
            newunit.markfuzzy(candidate.fuzzy)
            candidatenotes = candidate.getnotes().strip()
            if candidatenotes:
                newunit.addnote(candidatenotes)
            if self.addpercentage:
                newunit.addnote("%d%%" % score)
            units.append(newunit)
        return units


# We don't want to miss certain forms of words that only change a little
# at the end. Now we are tying this code to English, but it should serve
# us well. For example "category" should be found in "categories",
# "copy" should be found in "copied"
#
# The tuples define a regular expression to search for, and with what it
# should be replaced.
ignorepatterns = [
    (r"y\s*$", "ie"),          #category/categories, identify/identifies, apply/applied
    (r"[\s-]+", ""),           #down time / downtime, pre-order / preorder
    ("-", " "),                #pre-order / pre order
    (" ", "-"),                #pre order / pre-order
]

# Matches a trailing parenthesised part of a term, which is treated as context
context_re = re.compile(r"\s+\(.*\)\s*$")

class terminologymatcher(matcher):
    """A matcher with settings specifically for terminology matching"""

    # Longest source strings first
    sort_reverse = True

    def __init__(self, store, max_candidates=10, min_similarity=75, max_length=500, comparer=None):
        """Sets up the terminology matcher and loads the terms from store.

        @param store: A store, or a list of stores, containing the terminology.
        @param max_candidates: The maximum number of candidates to assemble.
        @param min_similarity: Accepted for compatibility with matcher, but
        not used: a fixed value of 10 is always passed on.
        @param max_length: The maximum length of a usable term; also passed
        to the default comparer.
        @param comparer: An optional Comparer with a similarity() function.
        Defaults to a TerminologyComparer.
        """
        if comparer is None:
            comparer = terminology.TerminologyComparer(max_length)
        # NOTE(review): the min_similarity argument is ignored here - confirm
        # that this is intentional before relying on it.
        matcher.__init__(self, store, max_candidates, min_similarity=10, max_length=max_length, comparer=comparer)
        self.addpercentage = False
        self.match_info = {}

    def inittm(self, store, reverse=False):
        """Normal initialisation, but convert all source strings to lower case

        Trailing context in parentheses is removed from each source string,
        and for every entry in ignorepatterns that applies, an extra unit
        with the altered source string is added as well.

        @param store: A store or a list of stores to load terms from.
        @param reverse: Ignored; accepted only to match matcher.inittm().
        """
        matcher.inittm(self, store)
        extras = []
        for unit in self.candidates.units:
            source = unit.source = context_re.sub("", unit.source).lower()
            for pattern, replacement in ignorepatterns:
                (newterm, occurrences) = re.subn(pattern, replacement, source)
                if occurrences:
                    new_unit = type(unit).buildfromunit(unit)
                    new_unit.source = newterm
                    # We mark it fuzzy to indicate that it isn't pristine
                    # NOTE(review): this marks the original unit, not the
                    # altered copy - confirm which of the two was intended.
                    unit.markfuzzy()
                    extras.append(new_unit)
        # The source strings changed above, so the order must be restored
        self.candidates.units.sort(key=sourcelen, reverse=self.sort_reverse)
        if extras:
            # We don't sort, so that the altered forms are at the back and
            # considered last.
            self.extendtm(extras, sort=False)

    def getstartlength(self, min_similarity, text):
        """Returns the minimum term length we are interested in.

        Both parameters are ignored."""
        # Let's limit false matches by not working with terms of two
        # characters or less
        return 3

    def getstoplength(self, min_similarity, text):
        """Returns the maximum term length we are interested in.

        Both parameters are ignored."""
        # Let's ignore terms with more than 30 characters. Perhaps someone
        # gave a file with normal (long) translations
        return 30

    def usable(self, unit):
        """Returns whether this translation unit is usable for terminology."""
        if not unit.istranslated():
            return False
        # Context in parentheses doesn't count towards the length of the term
        length = len(context_re.sub("", unit.source))
        return length <= self.MAX_LENGTH and length >= self.getstartlength(None, None)

    def matches(self, text):
        """Normal matching after converting text to lower case. Then replace
        with the original unit to retain comments, etc.

        @type text: String
        @param text: The text in which to look for known terms.
        @rtype: list
        @return: The candidate units that the comparer found in the text,
        without duplicate source/target pairs. For each of them,
        self.match_info[source]['pos'] holds the 'pos' value that the
        comparer reported.
        """
        text = text.lower()
        comparer = self.comparer
        comparer.match_info = {}
        # Start afresh, so that information from previous texts doesn't pile up
        self.match_info = {}
        matches = []
        known = set()
        for cand in self.candidates.units:
            source = cand.source
            identity = (source, cand.target)
            if identity in known:
                continue
            if comparer.similarity(text, source, self.MIN_SIMILARITY):
                self.match_info[source] = {'pos': comparer.match_info[source]['pos']}
                matches.append(cand)
                known.add(identity)
        return matches


# utility functions used by virtaal and tmserver to convert matching units in easily marshallable dictionaries
def unit2dict(unit):
    """converts a pounit to a simple dict structure for use over the web"""
    return {"source": unit.source, "target": unit.target,
            "quality": _parse_quality(unit.getnotes()), "context": unit.getcontext()}

def _parse_quality(comment):
    """extracts match quality from po comments

    @return: The digits in front of the first '%' in comment as a string, or
    None if there is no such percentage.
    """
    quality = re.search(r'([0-9]+)%', comment)
    if quality:
        return quality.group(1)
    return None