  1#! /usr/bin/env python
2
3"""
4Module difflib -- helpers for computing deltas between objects.
5
6Function get_close_matches(word, possibilities, n=3, cutoff=0.6):
7
8 Use SequenceMatcher to return list of the best "good enough" matches.
9
10 word is a sequence for which close matches are desired (typically a
11 string).
12
13 possibilities is a list of sequences against which to match word
14 (typically a list of strings).
15
16 Optional arg n (default 3) is the maximum number of close matches to
17 return. n must be > 0.
18
19 Optional arg cutoff (default 0.6) is a float in [0, 1]. Possibilities
20 that don't score at least that similar to word are ignored.
21
22 The best (no more than n) matches among the possibilities are returned
23 in a list, sorted by similarity score, most similar first.
24
25 >>> get_close_matches("appel", ["ape", "apple", "peach", "puppy"])
26 ['apple', 'ape']
27 >>> import keyword
28 >>> get_close_matches("wheel", keyword.kwlist)
29 ['while']
30 >>> get_close_matches("apple", keyword.kwlist)
31 []
32 >>> get_close_matches("accept", keyword.kwlist)
33 ['except']
34
35Class SequenceMatcher
36
37SequenceMatcher is a flexible class for comparing pairs of sequences of any
38type, so long as the sequence elements are hashable. The basic algorithm
39predates, and is a little fancier than, an algorithm published in the late
401980's by Ratcliff and Obershelp under the hyperbolic name "gestalt pattern
41matching". The basic idea is to find the longest contiguous matching
42subsequence that contains no "junk" elements (R-O doesn't address junk).
43The same idea is then applied recursively to the pieces of the sequences to
44the left and to the right of the matching subsequence. This does not yield
45minimal edit sequences, but does tend to yield matches that "look right"
46to people.
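For instance, comparing "abxcd" with "abcd", the longest matching block is
"ab"; applying the same idea to the leftover pieces then matches "cd" as
well.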
47
48Example, comparing two strings, and considering blanks to be "junk":
49
50>>> s = SequenceMatcher(lambda x: x == " ",
51... "private Thread currentThread;",
52... "private volatile Thread currentThread;")
53>>>
54
55.ratio() returns a float in [0, 1], measuring the "similarity" of the
56sequences. As a rule of thumb, a .ratio() value over 0.6 means the
57sequences are close matches:
58
59>>> print round(s.ratio(), 3)
600.866
61>>>
62
63If you're only interested in where the sequences match,
64.get_matching_blocks() is handy:
65
66>>> for block in s.get_matching_blocks():
67... print "a[%d] and b[%d] match for %d elements" % block
68a[0] and b[0] match for 8 elements
69a[8] and b[17] match for 6 elements
70a[14] and b[23] match for 15 elements
71a[29] and b[38] match for 0 elements
72
73Note that the last tuple returned by .get_matching_blocks() is always a
74dummy, (len(a), len(b), 0), and this is the only case in which the last
75tuple element (number of elements matched) is 0.
76
77If you want to know how to change the first sequence into the second, use
78.get_opcodes():
79
80>>> for opcode in s.get_opcodes():
81... print "%6s a[%d:%d] b[%d:%d]" % opcode
82 equal a[0:8] b[0:8]
83insert a[8:8] b[8:17]
84 equal a[8:14] b[17:23]
85 equal a[14:29] b[23:38]
86
87See Tools/scripts/ndiff.py for a fancy human-friendly file differencer,
88which uses SequenceMatcher both to view files as sequences of lines, and
89lines as sequences of characters.
90
91See also function get_close_matches() in this module, which shows how
92simple code building on SequenceMatcher can be used to do useful work.
93
94Timing: Basic R-O is cubic time worst case and quadratic time expected
 95case. SequenceMatcher is quadratic time for the worst case and has
96expected-case behavior dependent in a complicated way on how many
97elements the sequences have in common; best case time is linear.
 98
99SequenceMatcher methods:
100
101__init__(isjunk=None, a='', b='')
102 Construct a SequenceMatcher.
103
104 Optional arg isjunk is None (the default), or a one-argument function
105 that takes a sequence element and returns true iff the element is junk.
106 None is equivalent to passing "lambda x: 0", i.e. no elements are
107 considered to be junk. For example, pass
108 lambda x: x in " \\t"
109 if you're comparing lines as sequences of characters, and don't want to
110 synch up on blanks or hard tabs.
111
112 Optional arg a is the first of two sequences to be compared. By
113 default, an empty string. The elements of a must be hashable.
114
115 Optional arg b is the second of two sequences to be compared. By
116 default, an empty string. The elements of b must be hashable.
117
118set_seqs(a, b)
119 Set the two sequences to be compared.
120
121 >>> s = SequenceMatcher()
122 >>> s.set_seqs("abcd", "bcde")
123 >>> s.ratio()
124 0.75
125
126set_seq1(a)
127 Set the first sequence to be compared.
128
129 The second sequence to be compared is not changed.
130
131 >>> s = SequenceMatcher(None, "abcd", "bcde")
132 >>> s.ratio()
133 0.75
134 >>> s.set_seq1("bcde")
135 >>> s.ratio()
136 1.0
137 >>>
138
139 SequenceMatcher computes and caches detailed information about the
140 second sequence, so if you want to compare one sequence S against many
141 sequences, use .set_seq2(S) once and call .set_seq1(x) repeatedly for
142 each of the other sequences.
143
144 See also set_seqs() and set_seq2().
145
146set_seq2(b)
147 Set the second sequence to be compared.
148
149 The first sequence to be compared is not changed.
150
151 >>> s = SequenceMatcher(None, "abcd", "bcde")
152 >>> s.ratio()
153 0.75
154 >>> s.set_seq2("abcd")
155 >>> s.ratio()
156 1.0
157 >>>
158
159 SequenceMatcher computes and caches detailed information about the
160 second sequence, so if you want to compare one sequence S against many
161 sequences, use .set_seq2(S) once and call .set_seq1(x) repeatedly for
162 each of the other sequences.
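
For example, to score "abcd" against several candidates while reusing the
cached information:

>>> s = SequenceMatcher()
>>> s.set_seq2("abcd")
>>> for a in ("abcd", "bcde"):
...     s.set_seq1(a)
...     print s.a, round(s.ratio(), 3)
abcd 1.0
bcde 0.75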
163
164 See also set_seqs() and set_seq1().
165
166find_longest_match(alo, ahi, blo, bhi)
167 Find longest matching block in a[alo:ahi] and b[blo:bhi].
168
169 If isjunk is not defined:
170
171 Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
172 alo <= i <= i+k <= ahi
173 blo <= j <= j+k <= bhi
174 and for all (i',j',k') meeting those conditions,
175 k >= k'
176 i <= i'
177 and if i == i', j <= j'
178
179 In other words, of all maximal matching blocks, return one that starts
180 earliest in a, and of all those maximal matching blocks that start
181 earliest in a, return the one that starts earliest in b.
182
183 >>> s = SequenceMatcher(None, " abcd", "abcd abcd")
184 >>> s.find_longest_match(0, 5, 0, 9)
185 (0, 4, 5)
186
187 If isjunk is defined, first the longest matching block is determined as
188 above, but with the additional restriction that no junk element appears
189 in the block. Then that block is extended as far as possible by
190 matching (only) junk elements on both sides. So the resulting block
191 never matches on junk except as identical junk happens to be adjacent
192 to an "interesting" match.
193
194 Here's the same example as before, but considering blanks to be junk.
195 That prevents " abcd" from matching the " abcd" at the tail end of the
196 second sequence directly. Instead only the "abcd" can match, and
197 matches the leftmost "abcd" in the second sequence:
198
199 >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
200 >>> s.find_longest_match(0, 5, 0, 9)
201 (1, 0, 4)
202
203 If no blocks match, return (alo, blo, 0).
204
205 >>> s = SequenceMatcher(None, "ab", "c")
206 >>> s.find_longest_match(0, 2, 0, 1)
207 (0, 0, 0)
208
209get_matching_blocks()
210 Return list of triples describing matching subsequences.
211
212 Each triple is of the form (i, j, n), and means that
213 a[i:i+n] == b[j:j+n]. The triples are monotonically increasing in i
214 and in j.
215
216 The last triple is a dummy, (len(a), len(b), 0), and is the only triple
217 with n==0.
218
219 >>> s = SequenceMatcher(None, "abxcd", "abcd")
220 >>> s.get_matching_blocks()
221 [(0, 0, 2), (3, 2, 2), (5, 4, 0)]
222
223get_opcodes()
224 Return list of 5-tuples describing how to turn a into b.
225
226 Each tuple is of the form (tag, i1, i2, j1, j2). The first tuple has
227 i1 == j1 == 0, and remaining tuples have i1 == the i2 from the tuple
228 preceding it, and likewise for j1 == the previous j2.
229
230 The tags are strings, with these meanings:
231
232 'replace': a[i1:i2] should be replaced by b[j1:j2]
233 'delete': a[i1:i2] should be deleted.
234 Note that j1==j2 in this case.
235 'insert': b[j1:j2] should be inserted at a[i1:i1].
236 Note that i1==i2 in this case.
237 'equal': a[i1:i2] == b[j1:j2]
238
239 >>> a = "qabxcd"
240 >>> b = "abycdf"
241 >>> s = SequenceMatcher(None, a, b)
242 >>> for tag, i1, i2, j1, j2 in s.get_opcodes():
243 ... print ("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
244 ... (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2]))
245 delete a[0:1] (q) b[0:0] ()
246 equal a[1:3] (ab) b[0:2] (ab)
247 replace a[3:4] (x) b[2:3] (y)
248 equal a[4:6] (cd) b[3:5] (cd)
249 insert a[6:6] () b[5:6] (f)
250
251ratio()
252 Return a measure of the sequences' similarity (float in [0,1]).
253
254 Where T is the total number of elements in both sequences, and M is the
255 number of matches, this is 2.0*M / T. Note that this is 1 if the
256 sequences are identical, and 0 if they have nothing in common.
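(In the example below, the matching elements are "bcd", so M is 3, T is 8,
and 2.0*3/8 gives 0.75.)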
257
258 .ratio() is expensive to compute if you haven't already computed
259 .get_matching_blocks() or .get_opcodes(), in which case you may want to
260 try .quick_ratio() or .real_quick_ratio() first to get an upper bound.
261
262 >>> s = SequenceMatcher(None, "abcd", "bcde")
263 >>> s.ratio()
264 0.75
265 >>> s.quick_ratio()
266 0.75
267 >>> s.real_quick_ratio()
268 1.0
269
270quick_ratio()
271 Return an upper bound on .ratio() relatively quickly.
272
273 This isn't defined beyond that it is an upper bound on .ratio(), and
274 is faster to compute.
275
276real_quick_ratio()
277 Return an upper bound on ratio() very quickly.
278
279 This isn't defined beyond that it is an upper bound on .ratio(), and
280 is faster to compute than either .ratio() or .quick_ratio().
281"""
282
283TRACE = 0
284
285class SequenceMatcher:
286 def __init__(self, isjunk=None, a='', b=''):
287 """Construct a SequenceMatcher.
288
289 Optional arg isjunk is None (the default), or a one-argument
290 function that takes a sequence element and returns true iff the
291 element is junk. None is equivalent to passing "lambda x: 0", i.e.
292 no elements are considered to be junk. For example, pass
293 lambda x: x in " \\t"
294 if you're comparing lines as sequences of characters, and don't
295 want to synch up on blanks or hard tabs.
296
297 Optional arg a is the first of two sequences to be compared. By
298 default, an empty string. The elements of a must be hashable. See
299 also .set_seqs() and .set_seq1().
300
301 Optional arg b is the second of two sequences to be compared. By
302 default, an empty string. The elements of b must be hashable. See
303 also .set_seqs() and .set_seq2().
304 """
305
306 # Members:
307 # a
308 # first sequence
309 # b
310 # second sequence; differences are computed as "what do
311 # we need to do to 'a' to change it into 'b'?"
312 # b2j
313 # for x in b, b2j[x] is a list of the indices (into b)
314 # at which x appears; junk elements do not appear
315 # b2jhas
316 # b2j.has_key
317 # fullbcount
318 # for x in b, fullbcount[x] == the number of times x
319 # appears in b; only materialized if really needed (used
320 # only for computing quick_ratio())
321 # matching_blocks
322 # a list of (i, j, k) triples, where a[i:i+k] == b[j:j+k];
323 # ascending & non-overlapping in i and in j; terminated by
324 # a dummy (len(a), len(b), 0) sentinel
325 # opcodes
326 # a list of (tag, i1, i2, j1, j2) tuples, where tag is
327 # one of
328 # 'replace' a[i1:i2] should be replaced by b[j1:j2]
329 # 'delete' a[i1:i2] should be deleted
330 # 'insert' b[j1:j2] should be inserted
331 # 'equal' a[i1:i2] == b[j1:j2]
332 # isjunk
333 # a user-supplied function taking a sequence element and
334 # returning true iff the element is "junk" -- this has
335 # subtle but helpful effects on the algorithm, which I'll
336 # get around to writing up someday <0.9 wink>.
337 # DON'T USE! Only __chain_b uses this. Use isbjunk.
338 # isbjunk
339 # for x in b, isbjunk(x) == isjunk(x) but much faster;
340 # it's really the has_key method of a hidden dict.
341 # DOES NOT WORK for x in a!
342
343 self.isjunk = isjunk
344 self.a = self.b = None
345 self.set_seqs(a, b)
346
347 def set_seqs(self, a, b):
348 """Set the two sequences to be compared.
349
350 >>> s = SequenceMatcher()
351 >>> s.set_seqs("abcd", "bcde")
352 >>> s.ratio()
353 0.75
354 """
355
356 self.set_seq1(a)
357 self.set_seq2(b)
358
359 def set_seq1(self, a):
360 """Set the first sequence to be compared.
361
362 The second sequence to be compared is not changed.
363
364 >>> s = SequenceMatcher(None, "abcd", "bcde")
365 >>> s.ratio()
366 0.75
367 >>> s.set_seq1("bcde")
368 >>> s.ratio()
369 1.0
370 >>>
371
372 SequenceMatcher computes and caches detailed information about the
373 second sequence, so if you want to compare one sequence S against
374 many sequences, use .set_seq2(S) once and call .set_seq1(x)
375 repeatedly for each of the other sequences.
376
377 See also set_seqs() and set_seq2().
378 """
379
380 if a is self.a:
381 return
382 self.a = a
383 self.matching_blocks = self.opcodes = None
384
385 def set_seq2(self, b):
386 """Set the second sequence to be compared.
387
388 The first sequence to be compared is not changed.
389
390 >>> s = SequenceMatcher(None, "abcd", "bcde")
391 >>> s.ratio()
392 0.75
393 >>> s.set_seq2("abcd")
394 >>> s.ratio()
395 1.0
396 >>>
397
398 SequenceMatcher computes and caches detailed information about the
399 second sequence, so if you want to compare one sequence S against
400 many sequences, use .set_seq2(S) once and call .set_seq1(x)
401 repeatedly for each of the other sequences.
402
403 See also set_seqs() and set_seq1().
404 """
405
406 if b is self.b:
407 return
408 self.b = b
409 self.matching_blocks = self.opcodes = None
410 self.fullbcount = None
411 self.__chain_b()
412
413 # For each element x in b, set b2j[x] to a list of the indices in
414 # b where x appears; the indices are in increasing order; note that
415 # the number of times x appears in b is len(b2j[x]) ...
416 # when self.isjunk is defined, junk elements don't show up in this
417 # map at all, which stops the central find_longest_match method
418 # from starting any matching block at a junk element ...
419 # also creates the fast isbjunk function ...
420 # note that this is only called when b changes; so for cross-product
421 # kinds of matches, it's best to call set_seq2 once, then set_seq1
422 # repeatedly
423
424 def __chain_b(self):
425 # Because isjunk is a user-defined (not C) function, and we test
426 # for junk a LOT, it's important to minimize the number of calls.
427 # Before the tricks described here, __chain_b was by far the most
428 # time-consuming routine in the whole module! If anyone sees
429 # Jim Roskind, thank him again for profile.py -- I never would
430 # have guessed that.
431 # The first trick is to build b2j ignoring the possibility
432 # of junk. I.e., we don't call isjunk at all yet. Throwing
433 # out the junk later is much cheaper than building b2j "right"
434 # from the start.
435 b = self.b
436 self.b2j = b2j = {}
437 self.b2jhas = b2jhas = b2j.has_key
438 for i in xrange(len(b)):
439 elt = b[i]
440 if b2jhas(elt):
441 b2j[elt].append(i)
442 else:
443 b2j[elt] = [i]
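        # Illustration (hypothetical input): with b = "a b a" the loop
        # above leaves b2j == {'a': [0, 4], ' ': [1, 3], 'b': [2]}.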
444
445 # Now b2j.keys() contains elements uniquely, and especially when
446 # the sequence is a string, that's usually a good deal smaller
447 # than len(string). The difference is the number of isjunk calls
448 # saved.
449 isjunk, junkdict = self.isjunk, {}
450 if isjunk:
451 for elt in b2j.keys():
452 if isjunk(elt):
453 junkdict[elt] = 1 # value irrelevant; it's a set
454 del b2j[elt]
455
456 # Now for x in b, isjunk(x) == junkdict.has_key(x), but the
457 # latter is much faster. Note too that while there may be a
458 # lot of junk in the sequence, the number of *unique* junk
459 # elements is probably small. So the memory burden of keeping
460 # this dict alive is likely trivial compared to the size of b2j.
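        # Continuing the illustration above: with isjunk = lambda x: x == " "
        # and b = "a b a", the blank is deleted from b2j, so junkdict ends up
        # as {' ': 1} and b2j as {'a': [0, 4], 'b': [2]}.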
461 self.isbjunk = junkdict.has_key
462
463 def find_longest_match(self, alo, ahi, blo, bhi):
464 """Find longest matching block in a[alo:ahi] and b[blo:bhi].
465
466 If isjunk is not defined:
467
468 Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
469 alo <= i <= i+k <= ahi
470 blo <= j <= j+k <= bhi
471 and for all (i',j',k') meeting those conditions,
472 k >= k'
473 i <= i'
474 and if i == i', j <= j'
475
476 In other words, of all maximal matching blocks, return one that
477 starts earliest in a, and of all those maximal matching blocks that
478 start earliest in a, return the one that starts earliest in b.
479
480 >>> s = SequenceMatcher(None, " abcd", "abcd abcd")
481 >>> s.find_longest_match(0, 5, 0, 9)
482 (0, 4, 5)
483
484 If isjunk is defined, first the longest matching block is
485 determined as above, but with the additional restriction that no
486 junk element appears in the block. Then that block is extended as
487 far as possible by matching (only) junk elements on both sides. So
488 the resulting block never matches on junk except as identical junk
489 happens to be adjacent to an "interesting" match.
490
491 Here's the same example as before, but considering blanks to be
492 junk. That prevents " abcd" from matching the " abcd" at the tail
493 end of the second sequence directly. Instead only the "abcd" can
494 match, and matches the leftmost "abcd" in the second sequence:
495
496 >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
497 >>> s.find_longest_match(0, 5, 0, 9)
498 (1, 0, 4)
499
500 If no blocks match, return (alo, blo, 0).
501
502 >>> s = SequenceMatcher(None, "ab", "c")
503 >>> s.find_longest_match(0, 2, 0, 1)
504 (0, 0, 0)
505 """
506
507 # CAUTION: stripping common prefix or suffix would be incorrect.
508 # E.g.,
509 # ab
510 # acab
511 # Longest matching block is "ab", but if common prefix is
512 # stripped, it's "a" (tied with "b"). UNIX(tm) diff does so
513 # strip, so ends up claiming that ab is changed to acab by
514 # inserting "ca" in the middle. That's minimal but unintuitive:
515 # "it's obvious" that someone inserted "ac" at the front.
516 # Windiff ends up at the same place as diff, but by pairing up
517 # the unique 'b's and then matching the first two 'a's.
518
519 a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
520 besti, bestj, bestsize = alo, blo, 0
521 # find longest junk-free match
522 # during an iteration of the loop, j2len[j] = length of longest
523 # junk-free match ending with a[i-1] and b[j]
524 j2len = {}
525 nothing = []
526 for i in xrange(alo, ahi):
527 # look at all instances of a[i] in b; note that because
528 # b2j has no junk keys, the loop is skipped if a[i] is junk
529 j2lenget = j2len.get
530 newj2len = {}
531 for j in b2j.get(a[i], nothing):
532 # a[i] matches b[j]
533 if j < blo:
534 continue
535 if j >= bhi:
536 break
537 k = newj2len[j] = j2lenget(j-1, 0) + 1
538 if k > bestsize:
539 besti, bestj, bestsize = i-k+1, j-k+1, k
540 j2len = newj2len
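        # Illustration (hypothetical inputs): for a = "ab" and b = "cab"
        # with no junk, the loop above ends with besti, bestj, bestsize ==
        # 0, 1, 2, i.e. a[0:2] == b[1:3] == "ab".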
541
542 # Now that we have a wholly interesting match (albeit possibly
543 # empty!), we may as well suck up the matching junk on each
544 # side of it too. Can't think of a good reason not to, and it
545 # saves post-processing the (possibly considerable) expense of
546 # figuring out what to do with it. In the case of an empty
547 # interesting match, this is clearly the right thing to do,
548 # because no other kind of match is possible in the regions.
549 while besti > alo and bestj > blo and \
550 isbjunk(b[bestj-1]) and \
551 a[besti-1] == b[bestj-1]:
552 besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
553 while besti+bestsize < ahi and bestj+bestsize < bhi and \
554 isbjunk(b[bestj+bestsize]) and \
555 a[besti+bestsize] == b[bestj+bestsize]:
556 bestsize = bestsize + 1
557
558 if TRACE:
559 print "get_matching_blocks", alo, ahi, blo, bhi
560 print " returns", besti, bestj, bestsize
561 return besti, bestj, bestsize
562
563 def get_matching_blocks(self):
564 """Return list of triples describing matching subsequences.
565
566 Each triple is of the form (i, j, n), and means that
567 a[i:i+n] == b[j:j+n]. The triples are monotonically increasing in
568 i and in j.
569
570 The last triple is a dummy, (len(a), len(b), 0), and is the only
571 triple with n==0.
572
573 >>> s = SequenceMatcher(None, "abxcd", "abcd")
574 >>> s.get_matching_blocks()
575 [(0, 0, 2), (3, 2, 2), (5, 4, 0)]
576 """
577
578 if self.matching_blocks is not None:
579 return self.matching_blocks
580 self.matching_blocks = []
581 la, lb = len(self.a), len(self.b)
582 self.__helper(0, la, 0, lb, self.matching_blocks)
583 self.matching_blocks.append( (la, lb, 0) )
584 if TRACE:
585 print '*** matching blocks', self.matching_blocks
586 return self.matching_blocks
587
588 # builds list of matching blocks covering a[alo:ahi] and
589 # b[blo:bhi], appending them in increasing order to answer
590
591 def __helper(self, alo, ahi, blo, bhi, answer):
592 i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi)
593 # a[alo:i] vs b[blo:j] unknown
594 # a[i:i+k] same as b[j:j+k]
595 # a[i+k:ahi] vs b[j+k:bhi] unknown
596 if k:
597 if alo < i and blo < j:
598 self.__helper(alo, i, blo, j, answer)
599 answer.append(x)
600 if i+k < ahi and j+k < bhi:
601 self.__helper(i+k, ahi, j+k, bhi, answer)
602
603 def get_opcodes(self):
604 """Return list of 5-tuples describing how to turn a into b.
605
606 Each tuple is of the form (tag, i1, i2, j1, j2). The first tuple
607 has i1 == j1 == 0, and remaining tuples have i1 == the i2 from the
608 tuple preceding it, and likewise for j1 == the previous j2.
609
610 The tags are strings, with these meanings:
611
612 'replace': a[i1:i2] should be replaced by b[j1:j2]
613 'delete': a[i1:i2] should be deleted.
614 Note that j1==j2 in this case.
615 'insert': b[j1:j2] should be inserted at a[i1:i1].
616 Note that i1==i2 in this case.
617 'equal': a[i1:i2] == b[j1:j2]
618
619 >>> a = "qabxcd"
620 >>> b = "abycdf"
621 >>> s = SequenceMatcher(None, a, b)
622 >>> for tag, i1, i2, j1, j2 in s.get_opcodes():
623 ... print ("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
624 ... (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2]))
625 delete a[0:1] (q) b[0:0] ()
626 equal a[1:3] (ab) b[0:2] (ab)
627 replace a[3:4] (x) b[2:3] (y)
628 equal a[4:6] (cd) b[3:5] (cd)
629 insert a[6:6] () b[5:6] (f)
630 """
631
632 if self.opcodes is not None:
633 return self.opcodes
634 i = j = 0
635 self.opcodes = answer = []
636 for ai, bj, size in self.get_matching_blocks():
637 # invariant: we've pumped out correct diffs to change
638 # a[:i] into b[:j], and the next matching block is
639 # a[ai:ai+size] == b[bj:bj+size]. So we need to pump
640 # out a diff to change a[i:ai] into b[j:bj], pump out
641 # the matching block, and move (i,j) beyond the match
642 tag = ''
643 if i < ai and j < bj:
644 tag = 'replace'
645 elif i < ai:
646 tag = 'delete'
647 elif j < bj:
648 tag = 'insert'
649 if tag:
650 answer.append( (tag, i, ai, j, bj) )
651 i, j = ai+size, bj+size
652 # the list of matching blocks is terminated by a
653 # sentinel with size 0
654 if size:
655 answer.append( ('equal', ai, i, bj, j) )
656 return answer
657
658 def ratio(self):
659 """Return a measure of the sequences' similarity (float in [0,1]).
660
661 Where T is the total number of elements in both sequences, and
662 M is the number of matches, this is 2.0*M / T.
663 Note that this is 1 if the sequences are identical, and 0 if
664 they have nothing in common.
665
666 .ratio() is expensive to compute if you haven't already computed
667 .get_matching_blocks() or .get_opcodes(), in which case you may
668 want to try .quick_ratio() or .real_quick_ratio() first to get an
669 upper bound.
670
671 >>> s = SequenceMatcher(None, "abcd", "bcde")
672 >>> s.ratio()
673 0.75
674 >>> s.quick_ratio()
675 0.75
676 >>> s.real_quick_ratio()
677 1.0
678 """
679
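        # matches is the sum of the sizes of all matching blocks; the
        # trailing (len(a), len(b), 0) sentinel contributes nothing.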
680 matches = reduce(lambda sum, triple: sum + triple[-1],
681 self.get_matching_blocks(), 0)
682 return 2.0 * matches / (len(self.a) + len(self.b))
683
684 def quick_ratio(self):
685 """Return an upper bound on ratio() relatively quickly.
686
687 This isn't defined beyond that it is an upper bound on .ratio(), and
688 is faster to compute.
689 """
690
691 # viewing a and b as multisets, set matches to the cardinality
692 # of their intersection; this counts the number of matches
693 # without regard to order, so is clearly an upper bound
694 if self.fullbcount is None:
695 self.fullbcount = fullbcount = {}
696 for elt in self.b:
697 fullbcount[elt] = fullbcount.get(elt, 0) + 1
698 fullbcount = self.fullbcount
699 # avail[x] is the number of times x appears in 'b' less the
700 # number of times we've seen it in 'a' so far ... kinda
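        # Illustration (hypothetical input): for a = "aab" and b = "abb",
        # fullbcount == {'a': 1, 'b': 2}; the loop counts one match for the
        # first 'a' and one for 'b', giving 2.0 * 2 / 6 (about 0.67).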
701 avail = {}
702 availhas, matches = avail.has_key, 0
703 for elt in self.a:
704 if availhas(elt):
705 numb = avail[elt]
706 else:
707 numb = fullbcount.get(elt, 0)
708 avail[elt] = numb - 1
709 if numb > 0:
710 matches = matches + 1
711 return 2.0 * matches / (len(self.a) + len(self.b))
712
713 def real_quick_ratio(self):
714 """Return an upper bound on ratio() very quickly.
715
716 This isn't defined beyond that it is an upper bound on .ratio(), and
717 is faster to compute than either .ratio() or .quick_ratio().
718 """
719
720 la, lb = len(self.a), len(self.b)
721 # can't have more matches than the number of elements in the
722 # shorter sequence
723 return 2.0 * min(la, lb) / (la + lb)
724
725def get_close_matches(word, possibilities, n=3, cutoff=0.6):
726 """Use SequenceMatcher to return list of the best "good enough" matches.
727
728 word is a sequence for which close matches are desired (typically a
729 string).
730
731 possibilities is a list of sequences against which to match word
732 (typically a list of strings).
733
734 Optional arg n (default 3) is the maximum number of close matches to
735 return. n must be > 0.
736
737 Optional arg cutoff (default 0.6) is a float in [0, 1]. Possibilities
738 that don't score at least that similar to word are ignored.
739
740 The best (no more than n) matches among the possibilities are returned
741 in a list, sorted by similarity score, most similar first.
742
743 >>> get_close_matches("appel", ["ape", "apple", "peach", "puppy"])
744 ['apple', 'ape']
745 >>> import keyword
746 >>> get_close_matches("wheel", keyword.kwlist)
747 ['while']
748 >>> get_close_matches("apple", keyword.kwlist)
749 []
750 >>> get_close_matches("accept", keyword.kwlist)
751 ['except']
752 """
753
754 if not n > 0:
755 raise ValueError("n must be > 0: " + `n`)
756 if not 0.0 <= cutoff <= 1.0:
757 raise ValueError("cutoff must be in [0.0, 1.0]: " + `cutoff`)
758 result = []
759 s = SequenceMatcher()
760 s.set_seq2(word)
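    # word is installed as seq2 once so its cached statistics are reused;
    # each possibility is then tried as seq1.  real_quick_ratio() and
    # quick_ratio() are upper bounds on ratio(), so the cheap tests below
    # can rule out a candidate before the expensive ratio() call.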
761 for x in possibilities:
762 s.set_seq1(x)
763 if s.real_quick_ratio() >= cutoff and \
764 s.quick_ratio() >= cutoff and \
765 s.ratio() >= cutoff:
766 result.append((s.ratio(), x))
767 # Sort by score.
768 result.sort()
769 # Retain only the best n.
770 result = result[-n:]
771 # Move best-scorer to head of list.
772 result.reverse()
773 # Strip scores.
774 return [x for score, x in result]
775
776def _test():
777 import doctest, difflib
778 return doctest.testmod(difflib)
779
780if __name__ == "__main__":
781 _test()