#!/usr/bin/env python # -*- coding: utf-8 -*- # # Copyright 2006-2009 Zuza Software Foundation # # This file is part of translate. # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, see . """A class to calculate a similarity based on the Levenshtein distance. See http://en.wikipedia.org/wiki/Levenshtein_distance. If available, the python-Levenshtein package will be used which will provide better performance as it is implemented natively. See http://trific.ath.cx/python/levenshtein/ """ import math import sys def python_distance(a, b, stopvalue=-1): """Calculates the distance for use in similarity calculation. Python version.""" l1 = len(a) l2 = len(b) if stopvalue == -1: stopvalue = l2 current = range(l1+1) for i in range(1, l2+1): previous, current = current, [i]+[0]*l1 least = l2 for j in range(1, l1 + 1): change = previous[j-1] if a[j-1] != b[i-1]: change = change + 1 insert = previous[j] + 1 delete = current[j-1] + 1 current[j] = min(insert, delete, change) if least > current[j]: least = current[j] #The smallest value in the current array is the best (lowest) value #that can be attained in the end if the strings are identical further if least > stopvalue: return least return current[l1] def native_distance(a, b, stopvalue=0): """Same as python_distance in functionality. This uses the fast C version if we detected it earlier. Note that this does not support arbitrary sequence types, but only string types.""" return Levenshtein.distance(a, b) try: import Levenshtein as Levenshtein distance = native_distance except Exception: import logging logging.warning("Python-Levenshtein not found. Continuing with built-in (slower) fuzzy matching.") distance = python_distance class LevenshteinComparer: def __init__(self, max_len=200): self.MAX_LEN = max_len def similarity(self, a, b, stoppercentage=40): similarity = self.similarity_real(a, b, stoppercentage) measurements = 1 # chr_a = segment.characters(a) # chr_b = segment.characters(b) # if chr_a and chr_b and abs(len(chr_a) - len(a)) + abs(len(chr_b) - len(b)): # similarity += self.similarity_real(chr_a, chr_b, stoppercentage) # measurements += 1 # else: # similarity *= 2 # measurements += 1 # # wrd_a = segment.words(a) # wrd_b = segment.words(b) # if len(wrd_a) + len(wrd_b) > 2: # similarity += self.similarity_real(wrd_a, wrd_b, 0) # measurements += 1 return similarity / measurements def similarity_real(self, a, b, stoppercentage=40): """Returns the similarity between a and b based on Levenshtein distance. It can stop prematurely as soon as it sees that a and b will be no simmilar than the percentage specified in stoppercentage. The Levenshtein distance is calculated, but the following should be noted: - Only the first MAX_LEN characters are considered. Long strings differing at the end will therefore seem to match better than they should. See the use of the variable penalty to lessen the effect of this. - Strings with widely different lengths give the opportunity for shortcut. This is by definition of the Levenshtein distance: the distance will be at least as much as the difference in string length. - Calculation is stopped as soon as a similarity of stoppercentage becomes unattainable. See the use of the variable stopvalue. - Implementation uses memory O(min(len(a), len(b)) - Excecution time is O(len(a)*len(b)) """ l1, l2 = len(a), len(b) if l1 == 0 or l2 == 0: return 0 #Let's make l1 the smallest if l1 > l2: l1, l2 = l2, l1 a, b = b, a #maxsimilarity is the maximum similarity that can be attained as constrained #by the difference in string length maxsimilarity = 100 - 100.0*abs(l1 - l2)/l2 if maxsimilarity < stoppercentage: return maxsimilarity * 1.0 #Let's penalise the score in cases where we shorten strings penalty = 0 if l2 > self.MAX_LEN: b = b[:self.MAX_LEN] l2 = self.MAX_LEN penalty += 7 if l1 > self.MAX_LEN: a = a[:self.MAX_LEN] l1 = self.MAX_LEN penalty += 7 #The actual value in the array that would represent a giveup situation: stopvalue = math.ceil((100.0 - stoppercentage)/100 * l2) dist = distance(a, b, stopvalue) if dist > stopvalue: return stoppercentage - 1.0 #If MAX_LEN came into play, we consider the calculated distance to be #representative of the distance between the whole, untrimmed strings if dist != 0: penalty = 0 return 100 - (dist*1.0/l2)*100 - penalty if __name__ == "__main__": from sys import argv comparer = LevenshteinComparer() print "Similarity:\n%s" % comparer.similarity(argv[1], argv[2], 50)