#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright 2006-2009 Zuza Software Foundation
#
# This file is part of the Translate Toolkit.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>.

"""Class to perform translation memory matching from a store of translation units"""

import heapq
import re

from translate.search import lshtein
from translate.search import terminology
from translate.storage import base
from translate.storage import po
from translate.misc.multistring import multistring
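
# Example usage (an illustrative sketch, not part of this module; the file
# name is hypothetical, and the store is loaded here with
# translate.storage.factory.getobject()):
#
#     from translate.storage import factory
#     from translate.search.match import matcher
#
#     store = factory.getobject("existing_translations.po")
#     tm = matcher(store, max_candidates=5, min_similarity=80)
#     for suggestion in tm.matches("Open the file"):
#         print "%s -> %s (%s)" % (suggestion.source, suggestion.target,
#                                  suggestion.getnotes())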


def sourcelen(unit):
    """Returns the length of the source string"""
    return len(unit.source)


class matcher(object):
    """A class that will do matching and store configuration for the matching process"""

    sort_reverse = False

    def __init__(self, store, max_candidates=10, min_similarity=75, max_length=70, comparer=None, usefuzzy=False):
        """max_candidates is the maximum number of candidates that should be assembled,
        min_similarity is the minimum similarity that must be attained to be included in
        the result, comparer is an optional Comparer with a similarity() function"""
        if comparer is None:
            comparer = lshtein.LevenshteinComparer(max_length)
        self.comparer = comparer
        self.setparameters(max_candidates, min_similarity, max_length)
        self.usefuzzy = usefuzzy
        self.inittm(store)
        self.addpercentage = True

    def usable(self, unit):
        """Returns whether this translation unit is usable for TM"""
        #TODO: We might want to consider more attributes, such as approved, reviewed, etc.
        source = unit.source
        target = unit.target
        if source and target and (self.usefuzzy or not unit.isfuzzy()):
            if len(source) < 2:
                return False
            if source in self.existingunits and self.existingunits[source] == target:
                return False
            else:
                self.existingunits[source] = target
                return True
        return False

    def inittm(self, stores, reverse=False):
        """Initialises the memory for later use. We use simple base units for
        speedup."""
        # reverse is deprecated - just use self.sort_reverse
        self.existingunits = {}
        self.candidates = base.TranslationStore()

        if not isinstance(stores, list):
            stores = [stores]
        for store in stores:
            self.extendtm(store.units, store=store, sort=False)
        self.candidates.units.sort(key=sourcelen, reverse=self.sort_reverse)
        # print "TM initialised with %d candidates (%d to %d characters long)" % \
        #        (len(self.candidates.units), len(self.candidates.units[0].source), len(self.candidates.units[-1].source))

    def extendtm(self, units, store=None, sort=True):
        """Extends the memory with extra unit(s).

        @param units: The units to add to the TM.
        @param store: Optional store from where some metadata can be retrieved
        and associated with each unit.
        @param sort:  Optional parameter that can be set to False to suppress
        sorting of the candidates list. This should probably only be used in
        inittm().
        """
        if not isinstance(units, list):
            units = [units]
        candidates = filter(self.usable, units)
        for candidate in candidates:
            simpleunit = base.TranslationUnit("")
            # We need to ensure that we don't pass multistrings further, since
            # some modules (like the native Levenshtein) can't use it.
            if isinstance(candidate.source, multistring):
                if len(candidate.source.strings) > 1:
                    simpleunit.orig_source = candidate.source
                    simpleunit.orig_target = candidate.target
                simpleunit.source = unicode(candidate.source)
                simpleunit.target = unicode(candidate.target)
            else:
                simpleunit.source = candidate.source
                simpleunit.target = candidate.target
            # We only keep translator comments: if we took all notes, pot2po
            # would add previous programmer comments as translator comments in
            # the new po file. The trade-off is that programmer comments don't
            # appear in TM suggestions (in Pootle, for example).
            simpleunit.addnote(candidate.getnotes(origin="translator"))
            simpleunit.fuzzy = candidate.isfuzzy()
            self.candidates.units.append(simpleunit)
        if sort:
            self.candidates.units.sort(key=sourcelen, reverse=self.sort_reverse)

    def setparameters(self, max_candidates=10, min_similarity=75, max_length=70):
        """Sets the parameters without reinitialising the tm. If a parameter
        is not specified, it is set to the default, not ignored"""
        self.MAX_CANDIDATES = max_candidates
        self.MIN_SIMILARITY = min_similarity
        self.MAX_LENGTH = max_length

    def getstoplength(self, min_similarity, text):
        """Calculates a length beyond which we are not interested.
        The extra fat is because we don't use plain character distance only."""
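        # e.g. with min_similarity=75, a 20-character text allows candidates
        # of up to 20 / 0.75 = ~27 characters, capped at self.MAX_LENGTH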
        return min(len(text) / (min_similarity/100.0), self.MAX_LENGTH)

    def getstartlength(self, min_similarity, text):
        """Calculates the minimum length we are interested in.
        The extra fat is because we don't use plain character distance only."""
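        # e.g. with min_similarity=75, a 20-character text means candidates
        # shorter than 20 * 0.75 = 15 characters can't reach the threshold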
        return max(len(text) * (min_similarity/100.0), 1)

    def matches(self, text):
        """Returns a list of possible matches for given source text.

        @type text: String
        @param text: The text that will be searched for in the translation memory
        @rtype: list
        @return: a list of units with the source and target strings from the
        translation memory. If self.addpercentage is true (default) the match
        quality is given as a percentage in the notes.
        """
        bestcandidates = [(0.0, None)]*self.MAX_CANDIDATES
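        # bestcandidates is used as a fixed-size min-heap of (score, unit)
        # tuples: the weakest candidate is always at index 0, so it can be
        # evicted cheaply with heapq.heapreplace() when a better one is found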
        #We use self.MIN_SIMILARITY, but if we already know we have max_candidates
        #that are better, we can adjust min_similarity upwards for speedup
        min_similarity = self.MIN_SIMILARITY

        # We want to limit our search in self.candidates, so we want to ignore
        # all units with a source string that is too short or too long. We use
        # a binary search to find the shortest string, from where we start our
        # search in the candidates.

        # minimum source string length to be considered
        startlength = self.getstartlength(min_similarity, text)
        startindex = 0
        endindex = len(self.candidates.units)
        while startindex < endindex:
            mid = (startindex + endindex) // 2
            if sourcelen(self.candidates.units[mid]) < startlength:
                startindex = mid + 1
            else:
                endindex = mid

        # maximum source string length to be considered
        stoplength = self.getstoplength(min_similarity, text)
        lowestscore = 0

        for candidate in self.candidates.units[startindex:]:
            cmpstring = candidate.source
            if len(cmpstring) > stoplength:
                break
            similarity = self.comparer.similarity(text, cmpstring, min_similarity)
            if similarity < min_similarity:
                continue
            if similarity > lowestscore:
                heapq.heapreplace(bestcandidates, (similarity, candidate))
                lowestscore = bestcandidates[0][0]
                if lowestscore >= 100:
                    break
                if min_similarity < lowestscore:
                    min_similarity = lowestscore
                    stoplength = self.getstoplength(min_similarity, text)

        #Remove the empty ones:
        def notzero(item):
            score = item[0]
            return score != 0
        bestcandidates = filter(notzero, bestcandidates)
        #Sort for use as a general list, and reverse so the best one is at index 0
        bestcandidates.sort(reverse=True)
        return self.buildunits(bestcandidates)

    def buildunits(self, candidates):
        """Builds a list of units conforming to base API, with the score in the comment"""
        units = []
        for score, candidate in candidates:
            if hasattr(candidate, "orig_source"):
                candidate.source = candidate.orig_source
                candidate.target = candidate.orig_target
            newunit = po.pounit(candidate.source)
            newunit.target = candidate.target
            newunit.markfuzzy(candidate.fuzzy)
            candidatenotes = candidate.getnotes().strip()
            if candidatenotes:
                newunit.addnote(candidatenotes)
            if self.addpercentage:
                newunit.addnote("%d%%" % score)
            units.append(newunit)
        return units


# We don't want to miss certain forms of words that only change a little
# at the end. Now we are tying this code to English, but it should serve
# us well. For example "category" should be found in "categories",
# "copy" should be found in "copied"
#
# The tuples define a regular expression to search for, and with what it
# should be replaced.
ignorepatterns = [
    (r"y\s*$", "ie"),         #category/categories, identify/identifies, apply/applied
    (r"[\s-]+", ""),          #down time / downtime, pre-order / preorder
    ("-", " "),               #pre-order / pre order
    (" ", "-"),               #pre order / pre-order
]
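# For example, terminologymatcher.inittm() below uses these to give
# "category" the extra variant "categorie", which the substring-based
# terminology comparer can then find inside "categories".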

# Matches a trailing disambiguating context in parentheses, e.g. the
# " (noun)" in "file (noun)", so that it can be stripped before matching.
context_re = re.compile(r"\s+\(.*\)\s*$")

class terminologymatcher(matcher):
    """A matcher with settings specifically for terminology matching"""

    sort_reverse = True

    def __init__(self, store, max_candidates=10, min_similarity=75, max_length=500, comparer=None):
        if comparer is None:
            comparer = terminology.TerminologyComparer(max_length)
        matcher.__init__(self, store, max_candidates, min_similarity=10, max_length=max_length, comparer=comparer)
        self.addpercentage = False
        self.match_info = {}

    def inittm(self, store):
        """Normal initialisation, but convert all source strings to lower case"""
        matcher.inittm(self, store)
        extras = []
        for unit in self.candidates.units:
            source = unit.source = context_re.sub("", unit.source).lower()
            for ignorepattern in ignorepatterns:
                (newterm, occurrences) = re.subn(ignorepattern[0], ignorepattern[1], source)
                if occurrences:
                    new_unit = type(unit).buildfromunit(unit)
                    new_unit.source = newterm
                    # We mark it fuzzy to indicate that it isn't pristine
                    unit.markfuzzy()
                    extras.append(new_unit)
        self.candidates.units.sort(key=sourcelen, reverse=self.sort_reverse)
        if extras:
            # We don't sort, so that the altered forms are at the back and
            # considered last.
            self.extendtm(extras, sort=False)

    def getstartlength(self, min_similarity, text):
        # Let's number false matches by not working with terms of two
        # characters or less
        return 3

    def getstoplength(self, min_similarity, text):
        # Let's ignore terms with more than 30 characters. Perhaps someone
        # gave a file with normal (long) translations
        return 30

    def usable(self, unit):
        """Returns whether this translation unit is usable for terminology."""
        if not unit.istranslated():
            return False
        length = len(context_re.sub("", unit.source))
        return self.getstartlength(None, None) <= length <= self.MAX_LENGTH

    def matches(self, text):
        """Normal matching after converting text to lower case. Then replace
        with the original unit to retain comments, etc."""
        text = text.lower()
        comparer = self.comparer
        comparer.match_info = {}
        matches = []
        known = set()
        for cand in self.candidates.units:
            if (cand.source, cand.target) in known:
                continue
            source = cand.source
            if comparer.similarity(text, source, self.MIN_SIMILARITY):
                self.match_info[source] = {'pos': comparer.match_info[source]['pos']}
                matches.append(cand)
                known.add((cand.source, cand.target))
        return matches


# utility functions used by virtaal and tmserver to convert matching units into easily marshallable dictionaries
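# For example (illustrative values only), unit2dict() might return:
#     {"source": u"File", "target": u"Fichier", "quality": "87", "context": ""}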
def unit2dict(unit):
    """converts a pounit to a simple dict structure for use over the web"""
    return {"source": unit.source, "target": unit.target,
            "quality": _parse_quality(unit.getnotes()), "context": unit.getcontext()}

def _parse_quality(comment):
    """extracts the match quality percentage from po comments"""
    quality = re.search("([0-9]+)%", comment)
    if quality:
        return quality.group(1)
    # no quality note present
    return None