Package translate :: Package search :: Module lshtein
[hide private]
[frames] | no frames]

Source Code for Module translate.search.lshtein

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright 2006-2009 Zuza Software Foundation 
  5  # 
  6  # This file is part of translate. 
  7  # 
  8  # This program is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  # 
 13  # This program is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with this program; if not, see <http://www.gnu.org/licenses/>. 
 20   
 21  """A class to calculate a similarity based on the Levenshtein 
 22  distance. See http://en.wikipedia.org/wiki/Levenshtein_distance. 
 23   
 24  If available, the python-Levenshtein package will be used which will provide 
 25  better performance as it is implemented natively. See 
 26  http://trific.ath.cx/python/levenshtein/ 
 27  """ 
 28   
 29  import math 
 30   
 31   
32 -def python_distance(a, b, stopvalue=-1):
33 """Calculates the distance for use in similarity calculation. Python 34 version.""" 35 l1 = len(a) 36 l2 = len(b) 37 if stopvalue == -1: 38 stopvalue = l2 39 current = range(l1 + 1) 40 for i in range(1, l2 + 1): 41 previous, current = current, [i] + [0] * l1 42 least = l2 43 for j in range(1, l1 + 1): 44 change = previous[j-1] 45 if a[j-1] != b[i-1]: 46 change = change + 1 47 insert = previous[j] + 1 48 delete = current[j-1] + 1 49 current[j] = min(insert, delete, change) 50 if least > current[j]: 51 least = current[j] 52 #The smallest value in the current array is the best (lowest) value 53 #that can be attained in the end if the strings are identical further 54 if least > stopvalue: 55 return least 56 57 return current[l1]
58 59
60 -def native_distance(a, b, stopvalue=0):
61 """Same as python_distance in functionality. This uses the fast C 62 version if we detected it earlier. 63 64 Note that this does not support arbitrary sequence types, but only 65 string types.""" 66 return Levenshtein.distance(a, b)
67 68 try: 69 import Levenshtein as Levenshtein 70 distance = native_distance 71 except ImportError: 72 import logging 73 logging.warning("Python-Levenshtein not found. Continuing with built-in (slower) fuzzy matching.") 74 distance = python_distance 75 76
77 -class LevenshteinComparer:
78
79 - def __init__(self, max_len=200):
80 self.MAX_LEN = max_len
81
82 - def similarity(self, a, b, stoppercentage=40):
83 similarity = self.similarity_real(a, b, stoppercentage) 84 measurements = 1 85 86 # chr_a = segment.characters(a) 87 # chr_b = segment.characters(b) 88 # if chr_a and chr_b and abs(len(chr_a) - len(a)) + abs(len(chr_b) - len(b)): 89 # similarity += self.similarity_real(chr_a, chr_b, stoppercentage) 90 # measurements += 1 91 # else: 92 # similarity *= 2 93 # measurements += 1 94 # 95 # wrd_a = segment.words(a) 96 # wrd_b = segment.words(b) 97 # if len(wrd_a) + len(wrd_b) > 2: 98 # similarity += self.similarity_real(wrd_a, wrd_b, 0) 99 # measurements += 1 100 return similarity / measurements
101
102 - def similarity_real(self, a, b, stoppercentage=40):
103 """Returns the similarity between a and b based on Levenshtein distance. It 104 can stop prematurely as soon as it sees that a and b will be no simmilar than 105 the percentage specified in stoppercentage. 106 107 The Levenshtein distance is calculated, but the following should be noted: 108 - Only the first MAX_LEN characters are considered. Long strings differing 109 at the end will therefore seem to match better than they should. See the 110 use of the variable penalty to lessen the effect of this. 111 - Strings with widely different lengths give the opportunity for shortcut. 112 This is by definition of the Levenshtein distance: the distance will be 113 at least as much as the difference in string length. 114 - Calculation is stopped as soon as a similarity of stoppercentage becomes 115 unattainable. See the use of the variable stopvalue. 116 - Implementation uses memory O(min(len(a), len(b)) 117 - Excecution time is O(len(a)*len(b)) 118 """ 119 l1, l2 = len(a), len(b) 120 if l1 == 0 or l2 == 0: 121 return 0 122 #Let's make l1 the smallest 123 if l1 > l2: 124 l1, l2 = l2, l1 125 a, b = b, a 126 127 #maxsimilarity is the maximum similarity that can be attained as constrained 128 #by the difference in string length 129 maxsimilarity = 100 - 100.0*abs(l1 - l2)/l2 130 if maxsimilarity < stoppercentage: 131 return maxsimilarity * 1.0 132 133 #Let's penalise the score in cases where we shorten strings 134 penalty = 0 135 if l2 > self.MAX_LEN: 136 b = b[:self.MAX_LEN] 137 l2 = self.MAX_LEN 138 penalty += 7 139 if l1 > self.MAX_LEN: 140 a = a[:self.MAX_LEN] 141 l1 = self.MAX_LEN 142 penalty += 7 143 144 #The actual value in the array that would represent a giveup situation: 145 stopvalue = math.ceil((100.0 - stoppercentage)/100 * l2) 146 dist = distance(a, b, stopvalue) 147 if dist > stopvalue: 148 return stoppercentage - 1.0 149 150 #If MAX_LEN came into play, we consider the calculated distance to be 151 #representative of the distance between the whole, untrimmed strings 152 if dist != 0: 153 penalty = 0 154 return 100 - (dist*1.0/l2)*100 - penalty
155 156 157 if __name__ == "__main__": 158 from sys import argv 159 comparer = LevenshteinComparer() 160 print "Similarity:\n%s" % comparer.similarity(argv[1], argv[2], 50) 161