blob: cc9a776eb2cdd1aa43b63b7ff593748422071d02 [file] [log] [blame]
Fred Drakebaf71422001-02-19 16:31:02 +00001\section{\module{difflib} ---
2 Helpers for computing deltas}
3
4\declaremodule{standard}{difflib}
5\modulesynopsis{Helpers for computing differences between objects.}
6\moduleauthor{Tim Peters}{tim.one@home.com}
7\sectionauthor{Tim Peters}{tim.one@home.com}
8% LaTeXification by Fred L. Drake, Jr. <fdrake@acm.org>.
9
Fred Drakeda00cda2001-04-10 19:56:09 +000010\versionadded{2.1}
11
12
Fred Drake6943a292001-08-13 19:31:59 +000013\begin{classdesc*}{SequenceMatcher}
14 This is a flexible class for comparing pairs of sequences of any
15 type, so long as the sequence elements are hashable. The basic
16 algorithm predates, and is a little fancier than, an algorithm
17 published in the late 1980's by Ratcliff and Obershelp under the
18 hyperbolic name ``gestalt pattern matching.'' The idea is to find
19 the longest contiguous matching subsequence that contains no
20 ``junk'' elements (the Ratcliff and Obershelp algorithm doesn't
21 address junk). The same idea is then applied recursively to the
22 pieces of the sequences to the left and to the right of the matching
23 subsequence. This does not yield minimal edit sequences, but does
24 tend to yield matches that ``look right'' to people.
25
26 \strong{Timing:} The basic Ratcliff-Obershelp algorithm is cubic
27 time in the worst case and quadratic time in the expected case.
28 \class{SequenceMatcher} is quadratic time for the worst case and has
29 expected-case behavior dependent in a complicated way on how many
30 elements the sequences have in common; best case time is linear.
31\end{classdesc*}
32
33\begin{classdesc*}{Differ}
34 This is a class for comparing sequences of lines of text, and
35 producing human-readable differences or deltas. Differ uses
36 \class{SequenceMatcher} both to compare sequences of lines, and to
37 compare sequences of characters within similar (near-matching)
38 lines.
39
40 Each line of a \class{Differ} delta begins with a two-letter code:
41
42\begin{tableii}{l|l}{code}{Code}{Meaning}
43 \lineii{'- '}{line unique to sequence 1}
44 \lineii{'+ '}{line unique to sequence 2}
45 \lineii{' '}{line common to both sequences}
46 \lineii{'? '}{line not present in either input sequence}
47\end{tableii}
48
49 Lines beginning with `\code{?~}' attempt to guide the eye to
50 intraline differences, and were not present in either input
51 sequence. These lines can be confusing if the sequences contain tab
52 characters.
53\end{classdesc*}
54
Fred Drakebaf71422001-02-19 16:31:02 +000055\begin{funcdesc}{get_close_matches}{word, possibilities\optional{,
56 n\optional{, cutoff}}}
57 Return a list of the best ``good enough'' matches. \var{word} is a
58 sequence for which close matches are desired (typically a string),
59 and \var{possibilities} is a list of sequences against which to
60 match \var{word} (typically a list of strings).
61
62 Optional argument \var{n} (default \code{3}) is the maximum number
63 of close matches to return; \var{n} must be greater than \code{0}.
64
65 Optional argument \var{cutoff} (default \code{0.6}) is a float in
66 the range [0, 1]. Possibilities that don't score at least that
67 similar to \var{word} are ignored.
68
69 The best (no more than \var{n}) matches among the possibilities are
70 returned in a list, sorted by similarity score, most similar first.
71
72\begin{verbatim}
73>>> get_close_matches('appel', ['ape', 'apple', 'peach', 'puppy'])
74['apple', 'ape']
75>>> import keyword
76>>> get_close_matches('wheel', keyword.kwlist)
77['while']
78>>> get_close_matches('apple', keyword.kwlist)
79[]
80>>> get_close_matches('accept', keyword.kwlist)
81['except']
82\end{verbatim}
83\end{funcdesc}
84
Fred Drake6943a292001-08-13 19:31:59 +000085\begin{funcdesc}{ndiff}{a, b\optional{, linejunk\optional{,
86 charjunk}}}
87 Compare \var{a} and \var{b} (lists of strings); return a
88 \class{Differ}-style delta.
Fred Drakebaf71422001-02-19 16:31:02 +000089
Fred Drake6943a292001-08-13 19:31:59 +000090 Optional keyword parameters \var{linejunk} and \var{charjunk} are
91 for filter functions (or \code{None}):
92
93 \var{linejunk}: A function that should accept a single string
94 argument, and return true if the string is junk (or false if it is
95 not). The default is module-level function
96 \function{IS_LINE_JUNK()}, which filters out lines without visible
97 characters, except for at most one pound character (\character{\#}).
98
99 \var{charjunk}: A function that should accept a string of length 1.
100 The default is module-level function \function{IS_CHARACTER_JUNK()},
101 which filters out whitespace characters (a blank or tab; note: bad
102 idea to include newline in this!).
103
104 \file{Tools/scripts/ndiff.py} is a command-line front-end to this
105 function.
106
107\begin{verbatim}
108>>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),
109... 'ore\ntree\nemu\n'.splitlines(1)))
110>>> print ''.join(diff),
111- one
112? ^
113+ ore
114? ^
115- two
116- three
117? -
118+ tree
119+ emu
120\end{verbatim}
121\end{funcdesc}
122
123\begin{funcdesc}{restore}{sequence, which}
124 Return one of the two sequences that generated a delta.
125
126 Given a \var{sequence} produced by \method{Differ.compare()} or
127 \function{ndiff()}, extract lines originating from file 1 or 2
128 (parameter \var{which}), stripping off line prefixes.
129
130 Example:
131
132\begin{verbatim}
133>>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),
134... 'ore\ntree\nemu\n'.splitlines(1))
135>>> print ''.join(restore(diff, 1)),
136one
137two
138three
139>>> print ''.join(restore(diff, 2)),
140ore
141tree
142emu
143\end{verbatim}
144
145\end{funcdesc}
146
147
148\begin{funcdesc}{IS_LINE_JUNK}{line}:
149
150 Return 1 for ignorable line: iff \var{line} is blank or contains a
151 single \character{\#}. Used as a default for parameter
152 \var{linejunk} in \function{ndiff()}.
153
154\end{funcdesc}
155
156
157\begin{funcdesc}{IS_CHARACTER_JUNK}{ch}:
158
159 Return 1 for ignorable character: iff \var{ch} is a space or tab.
160 Used as a default for parameter \var{charjunk} in
161 \function{ndiff()}.
162
163\end{funcdesc}
Fred Drakebaf71422001-02-19 16:31:02 +0000164
165
Fred Drake6fda3ac2001-04-10 18:41:16 +0000166\begin{seealso}
167 \seetitle{Pattern Matching: The Gestalt Approach}{Discussion of a
168 similar algorithm by John W. Ratcliff and D. E. Metzener.
169 This was published in
170 \citetitle[http://www.ddj.com/]{Dr. Dobb's Journal} in
171 July, 1988.}
172\end{seealso}
173
174
Fred Drakebaf71422001-02-19 16:31:02 +0000175\subsection{SequenceMatcher Objects \label{sequence-matcher}}
176
Fred Drake96d7a702001-05-11 01:08:13 +0000177The \class{SequenceMatcher} class has this constructor:
178
Fred Drakebaf71422001-02-19 16:31:02 +0000179\begin{classdesc}{SequenceMatcher}{\optional{isjunk\optional{,
180 a\optional{, b}}}}
181 Optional argument \var{isjunk} must be \code{None} (the default) or
182 a one-argument function that takes a sequence element and returns
183 true if and only if the element is ``junk'' and should be ignored.
184 \code{None} is equivalent to passing \code{lambda x: 0}, i.e.\ no
185 elements are ignored. For example, pass
186
187\begin{verbatim}
Fred Drake447f5452001-02-23 19:13:07 +0000188lambda x: x in " \t"
Fred Drakebaf71422001-02-19 16:31:02 +0000189\end{verbatim}
190
191 if you're comparing lines as sequences of characters, and don't want
192 to synch up on blanks or hard tabs.
193
194 The optional arguments \var{a} and \var{b} are sequences to be
195 compared; both default to empty strings. The elements of both
196 sequences must be hashable.
197\end{classdesc}
198
199
200\class{SequenceMatcher} objects have the following methods:
201
202\begin{methoddesc}{set_seqs}{a, b}
203 Set the two sequences to be compared.
204\end{methoddesc}
205
206\class{SequenceMatcher} computes and caches detailed information about
207the second sequence, so if you want to compare one sequence against
208many sequences, use \method{set_seq2()} to set the commonly used
209sequence once and call \method{set_seq1()} repeatedly, once for each
210of the other sequences.
211
212\begin{methoddesc}{set_seq1}{a}
213 Set the first sequence to be compared. The second sequence to be
214 compared is not changed.
215\end{methoddesc}
216
217\begin{methoddesc}{set_seq2}{b}
218 Set the second sequence to be compared. The first sequence to be
219 compared is not changed.
220\end{methoddesc}
221
222\begin{methoddesc}{find_longest_match}{alo, ahi, blo, bhi}
223 Find longest matching block in \code{\var{a}[\var{alo}:\var{ahi}]}
224 and \code{\var{b}[\var{blo}:\var{bhi}]}.
225
226 If \var{isjunk} was omitted or \code{None},
227 \method{get_longest_match()} returns \code{(\var{i}, \var{j},
228 \var{k})} such that \code{\var{a}[\var{i}:\var{i}+\var{k}]} is equal
229 to \code{\var{b}[\var{j}:\var{j}+\var{k}]}, where
230 \code{\var{alo} <= \var{i} <= \var{i}+\var{k} <= \var{ahi}} and
231 \code{\var{blo} <= \var{j} <= \var{j}+\var{k} <= \var{bhi}}.
232 For all \code{(\var{i'}, \var{j'}, \var{k'})} meeting those
233 conditions, the additional conditions
234 \code{\var{k} >= \var{k'}},
235 \code{\var{i} <= \var{i'}},
236 and if \code{\var{i} == \var{i'}}, \code{\var{j} <= \var{j'}}
237 are also met.
238 In other words, of all maximal matching blocks, return one that
239 starts earliest in \var{a}, and of all those maximal matching blocks
240 that start earliest in \var{a}, return the one that starts earliest
241 in \var{b}.
242
243\begin{verbatim}
244>>> s = SequenceMatcher(None, " abcd", "abcd abcd")
245>>> s.find_longest_match(0, 5, 0, 9)
246(0, 4, 5)
247\end{verbatim}
248
249 If \var{isjunk} was provided, first the longest matching block is
250 determined as above, but with the additional restriction that no
251 junk element appears in the block. Then that block is extended as
252 far as possible by matching (only) junk elements on both sides.
253 So the resulting block never matches on junk except as identical
254 junk happens to be adjacent to an interesting match.
255
256 Here's the same example as before, but considering blanks to be junk.
Tim Peters754ba582001-02-20 11:24:35 +0000257 That prevents \code{' abcd'} from matching the \code{' abcd'} at the
Fred Drakebaf71422001-02-19 16:31:02 +0000258 tail end of the second sequence directly. Instead only the
259 \code{'abcd'} can match, and matches the leftmost \code{'abcd'} in
260 the second sequence:
261
262\begin{verbatim}
263>>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
264>>> s.find_longest_match(0, 5, 0, 9)
265(1, 0, 4)
266\end{verbatim}
267
268 If no blocks match, this returns \code{(\var{alo}, \var{blo}, 0)}.
269\end{methoddesc}
270
271\begin{methoddesc}{get_matching_blocks}{}
272 Return list of triples describing matching subsequences.
273 Each triple is of the form \code{(\var{i}, \var{j}, \var{n})}, and
274 means that \code{\var{a}[\var{i}:\var{i}+\var{n}] ==
275 \var{b}[\var{j}:\var{j}+\var{n}]}. The triples are monotonically
276 increasing in \var{i} and \var{j}.
277
278 The last triple is a dummy, and has the value \code{(len(\var{a}),
279 len(\var{b}), 0)}. It is the only triple with \code{\var{n} == 0}.
280 % Explain why a dummy is used!
281
282\begin{verbatim}
283>>> s = SequenceMatcher(None, "abxcd", "abcd")
284>>> s.get_matching_blocks()
285[(0, 0, 2), (3, 2, 2), (5, 4, 0)]
286\end{verbatim}
287\end{methoddesc}
288
289\begin{methoddesc}{get_opcodes}{}
290 Return list of 5-tuples describing how to turn \var{a} into \var{b}.
291 Each tuple is of the form \code{(\var{tag}, \var{i1}, \var{i2},
292 \var{j1}, \var{j2})}. The first tuple has \code{\var{i1} ==
293 \var{j1} == 0}, and remaining tuples have \var{i1} equal to the
294 \var{i2} from the preceeding tuple, and, likewise, \var{j1} equal to
295 the previous \var{j2}.
296
297 The \var{tag} values are strings, with these meanings:
298
299\begin{tableii}{l|l}{code}{Value}{Meaning}
300 \lineii{'replace'}{\code{\var{a}[\var{i1}:\var{i2}]} should be
301 replaced by \code{\var{b}[\var{j1}:\var{j2}]}.}
302 \lineii{'delete'}{\code{\var{a}[\var{i1}:\var{i2}]} should be
303 deleted. Note that \code{\var{j1} == \var{j2}} in
304 this case.}
305 \lineii{'insert'}{\code{\var{b}[\var{j1}:\var{j2}]} should be
306 inserted at \code{\var{a}[\var{i1}:\var{i1}]}.
307 Note that \code{\var{i1} == \var{i2}} in this
308 case.}
309 \lineii{'equal'}{\code{\var{a}[\var{i1}:\var{i2}] ==
310 \var{b}[\var{j1}:\var{j2}]} (the sub-sequences are
311 equal).}
312\end{tableii}
313
314For example:
315
316\begin{verbatim}
317>>> a = "qabxcd"
318>>> b = "abycdf"
319>>> s = SequenceMatcher(None, a, b)
320>>> for tag, i1, i2, j1, j2 in s.get_opcodes():
321... print ("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
322... (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2]))
323 delete a[0:1] (q) b[0:0] ()
324 equal a[1:3] (ab) b[0:2] (ab)
325replace a[3:4] (x) b[2:3] (y)
326 equal a[4:6] (cd) b[3:5] (cd)
327 insert a[6:6] () b[5:6] (f)
328\end{verbatim}
329\end{methoddesc}
330
331\begin{methoddesc}{ratio}{}
332 Return a measure of the sequences' similarity as a float in the
333 range [0, 1].
334
335 Where T is the total number of elements in both sequences, and M is
Fred Drake6943a292001-08-13 19:31:59 +0000336 the number of matches, this is 2.0*M / T. Note that this is
337 \code{1.0} if the sequences are identical, and \code{0.0} if they
338 have nothing in common.
Fred Drakebaf71422001-02-19 16:31:02 +0000339
340 This is expensive to compute if \method{get_matching_blocks()} or
341 \method{get_opcodes()} hasn't already been called, in which case you
342 may want to try \method{quick_ratio()} or
343 \method{real_quick_ratio()} first to get an upper bound.
344\end{methoddesc}
345
346\begin{methoddesc}{quick_ratio}{}
347 Return an upper bound on \method{ratio()} relatively quickly.
348
349 This isn't defined beyond that it is an upper bound on
350 \method{ratio()}, and is faster to compute.
351\end{methoddesc}
352
353\begin{methoddesc}{real_quick_ratio}{}
354 Return an upper bound on \method{ratio()} very quickly.
355
356 This isn't defined beyond that it is an upper bound on
357 \method{ratio()}, and is faster to compute than either
358 \method{ratio()} or \method{quick_ratio()}.
359\end{methoddesc}
360
Tim Peters754ba582001-02-20 11:24:35 +0000361The three methods that return the ratio of matching to total characters
362can give different results due to differing levels of approximation,
363although \method{quick_ratio()} and \method{real_quick_ratio()} are always
364at least as large as \method{ratio()}:
Fred Drakebaf71422001-02-19 16:31:02 +0000365
366\begin{verbatim}
367>>> s = SequenceMatcher(None, "abcd", "bcde")
368>>> s.ratio()
3690.75
370>>> s.quick_ratio()
3710.75
372>>> s.real_quick_ratio()
3731.0
374\end{verbatim}
375
376
Fred Drake6943a292001-08-13 19:31:59 +0000377\subsection{SequenceMatcher Examples \label{sequencematcher-examples}}
Fred Drakebaf71422001-02-19 16:31:02 +0000378
379
380This example compares two strings, considering blanks to be ``junk:''
381
382\begin{verbatim}
383>>> s = SequenceMatcher(lambda x: x == " ",
384... "private Thread currentThread;",
385... "private volatile Thread currentThread;")
386\end{verbatim}
387
388\method{ratio()} returns a float in [0, 1], measuring the similarity
389of the sequences. As a rule of thumb, a \method{ratio()} value over
3900.6 means the sequences are close matches:
391
392\begin{verbatim}
393>>> print round(s.ratio(), 3)
3940.866
395\end{verbatim}
396
397If you're only interested in where the sequences match,
398\method{get_matching_blocks()} is handy:
399
400\begin{verbatim}
401>>> for block in s.get_matching_blocks():
402... print "a[%d] and b[%d] match for %d elements" % block
403a[0] and b[0] match for 8 elements
404a[8] and b[17] match for 6 elements
405a[14] and b[23] match for 15 elements
406a[29] and b[38] match for 0 elements
407\end{verbatim}
408
409Note that the last tuple returned by \method{get_matching_blocks()} is
410always a dummy, \code{(len(\var{a}), len(\var{b}), 0)}, and this is
411the only case in which the last tuple element (number of elements
412matched) is \code{0}.
413
414If you want to know how to change the first sequence into the second,
415use \method{get_opcodes()}:
416
417\begin{verbatim}
418>>> for opcode in s.get_opcodes():
419... print "%6s a[%d:%d] b[%d:%d]" % opcode
420 equal a[0:8] b[0:8]
421insert a[8:8] b[8:17]
422 equal a[8:14] b[17:23]
423 equal a[14:29] b[23:38]
424\end{verbatim}
425
Fred Drakebaf71422001-02-19 16:31:02 +0000426See also the function \function{get_close_matches()} in this module,
427which shows how simple code building on \class{SequenceMatcher} can be
428used to do useful work.
Fred Drake6943a292001-08-13 19:31:59 +0000429
430
431\subsection{Differ Objects \label{differ-objects}}
432
433Note that \class{Differ}-generated deltas make no claim to be
434\strong{minimal} diffs. To the contrary, minimal diffs are often
435counter-intuitive, because they synch up anywhere possible, sometimes
436accidental matches 100 pages apart. Restricting synch points to
437contiguous matches preserves some notion of locality, at the
438occasional cost of producing a longer diff.
439
440The \class{Differ} class has this constructor:
441
442\begin{classdesc}{Differ}{\optional{linejunk\optional{, charjunk}}}
443 Optional keyword parameters \var{linejunk} and \var{charjunk} are
444 for filter functions (or \code{None}):
445
446 \var{linejunk}: A function that should accept a single string
447 argument, and return true iff the string is junk. The default is
448 module-level function \function{IS_LINE_JUNK()}, which filters out
449 lines without visible characters, except for at most one pound
450 character (\character{\#}).
451
452 \var{charjunk}: A function that should accept a string of length 1.
453 The default is module-level function \function{IS_CHARACTER_JUNK()},
454 which filters out whitespace characters (a blank or tab; note: bad
455 idea to include newline in this!).
456\end{classdesc}
457
458\class{Differ} objects are used (deltas generated) via a single
459method:
460
461\begin{methoddesc}{compare}{a, b}
462 Compare two sequences of lines; return the resulting delta (list).
463
464 Each sequence must contain individual single-line strings ending
465 with newlines. Such sequences can be obtained from the
466 \method{readlines()} method of file-like objects. The list returned
467 is also made up of newline-terminated strings, and ready to be used
468 with the \method{writelines()} method of a file-like object.
469\end{methoddesc}
470
471
472\subsection{Differ Example \label{differ-examples}}
473
474This example compares two texts. First we set up the texts, sequences
475of individual single-line strings ending with newlines (such sequences
476can also be obtained from the \method{readlines()} method of file-like
477objects):
478
479\begin{verbatim}
480>>> text1 = ''' 1. Beautiful is better than ugly.
481... 2. Explicit is better than implicit.
482... 3. Simple is better than complex.
483... 4. Complex is better than complicated.
484... '''.splitlines(1)
485>>> len(text1)
4864
487>>> text1[0][-1]
488'\n'
489>>> text2 = ''' 1. Beautiful is better than ugly.
490... 3. Simple is better than complex.
491... 4. Complicated is better than complex.
492... 5. Flat is better than nested.
493... '''.splitlines(1)
494\end{verbatim}
495
496Next we instantiate a Differ object:
497
498\begin{verbatim}
499>>> d = Differ()
500\end{verbatim}
501
502Note that when instantiating a \class{Differ} object we may pass
503functions to filter out line and character ``junk.'' See the
504\method{Differ()} constructor for details.
505
506Finally, we compare the two:
507
508\begin{verbatim}
509>>> result = d.compare(text1, text2)
510\end{verbatim}
511
512\code{result} is a list of strings, so let's pretty-print it:
513
514\begin{verbatim}
515>>> from pprint import pprint
516>>> pprint(result)
517[' 1. Beautiful is better than ugly.\n',
518 '- 2. Explicit is better than implicit.\n',
519 '- 3. Simple is better than complex.\n',
520 '+ 3. Simple is better than complex.\n',
521 '? ++ \n',
522 '- 4. Complex is better than complicated.\n',
523 '? ^ ---- ^ \n',
524 '+ 4. Complicated is better than complex.\n',
525 '? ++++ ^ ^ \n',
526 '+ 5. Flat is better than nested.\n']
527\end{verbatim}
528
529As a single multi-line string it looks like this:
530
531\begin{verbatim}
532>>> import sys
533>>> sys.stdout.writelines(result)
534 1. Beautiful is better than ugly.
535- 2. Explicit is better than implicit.
536- 3. Simple is better than complex.
537+ 3. Simple is better than complex.
538? ++
539- 4. Complex is better than complicated.
540? ^ ---- ^
541+ 4. Complicated is better than complex.
542? ++++ ^ ^
543+ 5. Flat is better than nested.
544\end{verbatim}