1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21 """Class to perform translation memory matching from a store of translation units"""
22
23 import heapq
24 import re
25
26 from translate.search import lshtein
27 from translate.search import terminology
28 from translate.storage import base
29 from translate.storage import po
30 from translate.misc.multistring import multistring
31
32
34 """Returns the length of the source string"""
35 return len(unit.source)
36
37
39
40 def _matches_cmp(x, y):
41
42
43 c = cmp(match_info[x.source]['pos'], match_info[y.source]['pos'])
44 return c and c or cmp(len(y.source), len(x.source))
45 matches.sort(_matches_cmp)
46
47
49 """A class that will do matching and store configuration for the matching process"""
50
51 sort_reverse = False
52
53 - def __init__(self, store, max_candidates=10, min_similarity=75, max_length=70, comparer=None, usefuzzy=False):
54 """max_candidates is the maximum number of candidates that should be assembled,
55 min_similarity is the minimum similarity that must be attained to be included in
56 the result, comparer is an optional Comparer with similarity() function"""
57 if comparer is None:
58 comparer = lshtein.LevenshteinComparer(max_length)
59 self.comparer = comparer
60 self.setparameters(max_candidates, min_similarity, max_length)
61 self.usefuzzy = usefuzzy
62 self.inittm(store)
63 self.addpercentage = True
64
79
80 - def inittm(self, stores, reverse=False):
81 """Initialises the memory for later use. We use simple base units for
82 speedup."""
83
84 self.existingunits = {}
85 self.candidates = base.TranslationStore()
86
87 if isinstance(stores, base.TranslationStore):
88 stores = [stores]
89 for store in stores:
90 self.extendtm(store.units, store=store, sort=False)
91 self.candidates.units.sort(key=sourcelen, reverse=self.sort_reverse)
92
93
94
95 - def extendtm(self, units, store=None, sort=True):
96 """Extends the memory with extra unit(s).
97
98 @param units: The units to add to the TM.
99 @param store: Optional store from where some metadata can be retrieved
100 and associated with each unit.
101 @param sort: Optional parameter that can be set to False to supress
102 sorting of the candidates list. This should probably only be used in
103 inittm().
104 """
105 if isinstance(units, base.TranslationUnit):
106 units = [units]
107 candidates = filter(self.usable, units)
108 for candidate in candidates:
109 simpleunit = base.TranslationUnit("")
110
111
112 if isinstance(candidate.source, multistring):
113 if len(candidate.source.strings) > 1:
114 simpleunit.orig_source = candidate.source
115 simpleunit.orig_target = candidate.target
116 simpleunit.source = unicode(candidate.source)
117 simpleunit.target = unicode(candidate.target)
118 else:
119 simpleunit.source = candidate.source
120 simpleunit.target = candidate.target
121
122
123
124
125 simpleunit.addnote(candidate.getnotes(origin="translator"))
126 simpleunit.fuzzy = candidate.isfuzzy()
127 self.candidates.units.append(simpleunit)
128 if sort:
129 self.candidates.units.sort(key=sourcelen, reverse=self.sort_reverse)
130
131 - def setparameters(self, max_candidates=10, min_similarity=75, max_length=70):
132 """Sets the parameters without reinitialising the tm. If a parameter
133 is not specified, it is set to the default, not ignored"""
134 self.MAX_CANDIDATES = max_candidates
135 self.MIN_SIMILARITY = min_similarity
136 self.MAX_LENGTH = max_length
137
139 """Calculates a length beyond which we are not interested.
140 The extra fat is because we don't use plain character distance only."""
141 return min(len(text) / (min_similarity/100.0), self.MAX_LENGTH)
142
144 """Calculates the minimum length we are interested in.
145 The extra fat is because we don't use plain character distance only."""
146 return max(len(text) * (min_similarity/100.0), 1)
147
149 """Returns a list of possible matches for given source text.
150
151 @type text: String
152 @param text: The text that will be search for in the translation memory
153 @rtype: list
154 @return: a list of units with the source and target strings from the
155 translation memory. If self.addpercentage is true (default) the match
156 quality is given as a percentage in the notes.
157 """
158 bestcandidates = [(0.0, None)]*self.MAX_CANDIDATES
159
160
161 min_similarity = self.MIN_SIMILARITY
162
163
164
165
166
167
168
169 startlength = self.getstartlength(min_similarity, text)
170 startindex = 0
171 endindex = len(self.candidates.units)
172 while startindex < endindex:
173 mid = (startindex + endindex) // 2
174 if sourcelen(self.candidates.units[mid]) < startlength:
175 startindex = mid + 1
176 else:
177 endindex = mid
178
179
180 stoplength = self.getstoplength(min_similarity, text)
181 lowestscore = 0
182
183 for candidate in self.candidates.units[startindex:]:
184 cmpstring = candidate.source
185 if len(cmpstring) > stoplength:
186 break
187 similarity = self.comparer.similarity(text, cmpstring, min_similarity)
188 if similarity < min_similarity:
189 continue
190 if similarity > lowestscore:
191 heapq.heapreplace(bestcandidates, (similarity, candidate))
192 lowestscore = bestcandidates[0][0]
193 if lowestscore >= 100:
194 break
195 if min_similarity < lowestscore:
196 min_similarity = lowestscore
197 stoplength = self.getstoplength(min_similarity, text)
198
199
200 def notzero(item):
201 score = item[0]
202 return score != 0
203 bestcandidates = filter(notzero, bestcandidates)
204
205 bestcandidates.sort(reverse=True)
206 return self.buildunits(bestcandidates)
207
209 """Builds a list of units conforming to base API, with the score in the comment"""
210 units = []
211 for score, candidate in candidates:
212 if hasattr(candidate, "orig_source"):
213 candidate.source = candidate.orig_source
214 candidate.target = candidate.orig_target
215 newunit = po.pounit(candidate.source)
216 newunit.target = candidate.target
217 newunit.markfuzzy(candidate.fuzzy)
218 candidatenotes = candidate.getnotes().strip()
219 if candidatenotes:
220 newunit.addnote(candidatenotes)
221 if self.addpercentage:
222 newunit.addnote("%d%%" % score)
223 units.append(newunit)
224 return units
225
226
227
228
229
230
231
232
233
# Substitutions whose result should still be matched as the same term,
# e.g. plural "-ies" vs "-y", and hyphen/space variations.
# Raw strings avoid invalid "\s" escape warnings on Python 3.
ignorepatterns = [
    (r"y\s*$", "ie"),  # category -> categorie (matches "categories")
    (r"[\s-]+", ""),   # start-up / start up -> startup
    ("-", " "),        # pre-order -> pre order
    (" ", "-"),        # pre order -> pre-order
]
ignorepatterns_re = [(re.compile(a), b) for (a, b) in ignorepatterns]

# Strips a trailing parenthesised disambiguation, e.g. "file (noun)" -> "file".
context_re = re.compile(r"\s+\(.*\)\s*$")
243
244
246 """A matcher with settings specifically for terminology matching"""
247
248 sort_reverse = True
249
250 - def __init__(self, store, max_candidates=10, min_similarity=75, max_length=500, comparer=None):
251 if comparer is None:
252 comparer = terminology.TerminologyComparer(max_length)
253 matcher.__init__(self, store, max_candidates, min_similarity=10, max_length=max_length, comparer=comparer)
254 self.addpercentage = False
255 self.match_info = {}
256
258 """Normal initialisation, but convert all source strings to lower case"""
259 matcher.inittm(self, store)
260 extras = []
261 for unit in self.candidates.units:
262 source = unit.source = context_re.sub("", unit.source).lower()
263 for ignorepattern_re, replacement in ignorepatterns_re:
264 (newterm, occurrences) = ignorepattern_re.subn(replacement, source)
265 if occurrences:
266 new_unit = type(unit).buildfromunit(unit)
267 new_unit.source = newterm
268
269 unit.markfuzzy()
270 extras.append(new_unit)
271 self.candidates.units.sort(key=sourcelen, reverse=self.sort_reverse)
272 if extras:
273
274
275 self.extendtm(extras, sort=False)
276
281
286
288 """Returns whether this translation unit is usable for terminology."""
289 if not unit.istranslated():
290 return False
291 l = len(context_re.sub("", unit.source))
292 return l <= self.MAX_LENGTH and l >= self.getstartlength(None, None)
293
295 """Normal matching after converting text to lower case. Then replace
296 with the original unit to retain comments, etc."""
297 text_l = len(text)
298 if text_l < self.getstartlength(0, ''):
299
300 return []
301 text = text.lower()
302 comparer = self.comparer
303 comparer.match_info = {}
304 match_info = {}
305 matches = []
306 known = set()
307
308
309
310
311
312
313
314 startindex = 0
315 endindex = len(self.candidates.units)
316 while startindex < endindex:
317 mid = (startindex + endindex) // 2
318 if sourcelen(self.candidates.units[mid]) > text_l:
319 startindex = mid + 1
320 else:
321 endindex = mid
322
323 for cand in self.candidates.units[startindex:]:
324 source = cand.source
325 if (source, cand.target) in known:
326 continue
327 if comparer.similarity(text, source, self.MIN_SIMILARITY):
328 match_info[source] = {'pos': comparer.match_info[source]['pos']}
329 matches.append(cand)
330 known.add((source, cand.target))
331
332 final_matches = []
333 lastend = 0
334 _sort_matches(matches, match_info)
335 for match in matches:
336 start_pos = match_info[match.source]['pos']
337 if start_pos < lastend:
338 continue
339 end = start_pos + len(match.source)
340
341 final_matches.append(match)
342
343
344 for m in matches:
345 if m is match:
346 continue
347 m_info = match_info[m.source]
348 m_end = m_info['pos']
349 if m_end > start_pos:
350
351 break
352 m_end += len(m.source)
353 if start_pos == m_info['pos'] and end == m_end:
354
355 final_matches.append(m)
356
357 lastend = end
358 if final_matches:
359 self.match_info = match_info
360 return final_matches
361
362
363
368
369
371 """extracts match quality from po comments"""
372 quality = re.search('([0-9]+)%', comment)
373 if quality:
374 return quality.group(1)
375