blob: a66943526a00a69ff646080ceb6dfc13e302faca [file] [log] [blame]
Fred Drakebaf71422001-02-19 16:31:02 +00001\section{\module{difflib} ---
2 Helpers for computing deltas}
3
4\declaremodule{standard}{difflib}
5\modulesynopsis{Helpers for computing differences between objects.}
6\moduleauthor{Tim Peters}{tim.one@home.com}
7\sectionauthor{Tim Peters}{tim.one@home.com}
8% LaTeXification by Fred L. Drake, Jr. <fdrake@acm.org>.
9
Fred Drakeda00cda2001-04-10 19:56:09 +000010\versionadded{2.1}
11
12
Fred Drake6943a292001-08-13 19:31:59 +000013\begin{classdesc*}{SequenceMatcher}
14 This is a flexible class for comparing pairs of sequences of any
15 type, so long as the sequence elements are hashable. The basic
16 algorithm predates, and is a little fancier than, an algorithm
17 published in the late 1980's by Ratcliff and Obershelp under the
18 hyperbolic name ``gestalt pattern matching.'' The idea is to find
19 the longest contiguous matching subsequence that contains no
20 ``junk'' elements (the Ratcliff and Obershelp algorithm doesn't
21 address junk). The same idea is then applied recursively to the
22 pieces of the sequences to the left and to the right of the matching
23 subsequence. This does not yield minimal edit sequences, but does
24 tend to yield matches that ``look right'' to people.
25
26 \strong{Timing:} The basic Ratcliff-Obershelp algorithm is cubic
27 time in the worst case and quadratic time in the expected case.
28 \class{SequenceMatcher} is quadratic time for the worst case and has
29 expected-case behavior dependent in a complicated way on how many
30 elements the sequences have in common; best case time is linear.
31\end{classdesc*}
32
33\begin{classdesc*}{Differ}
34 This is a class for comparing sequences of lines of text, and
Tim Peters8a9c2842001-09-22 21:30:22 +000035 producing human-readable differences or deltas. Differ uses
Fred Drake6943a292001-08-13 19:31:59 +000036 \class{SequenceMatcher} both to compare sequences of lines, and to
37 compare sequences of characters within similar (near-matching)
38 lines.
39
40 Each line of a \class{Differ} delta begins with a two-letter code:
41
42\begin{tableii}{l|l}{code}{Code}{Meaning}
43 \lineii{'- '}{line unique to sequence 1}
44 \lineii{'+ '}{line unique to sequence 2}
45 \lineii{' '}{line common to both sequences}
46 \lineii{'? '}{line not present in either input sequence}
47\end{tableii}
48
49 Lines beginning with `\code{?~}' attempt to guide the eye to
50 intraline differences, and were not present in either input
51 sequence. These lines can be confusing if the sequences contain tab
52 characters.
53\end{classdesc*}
54
Fred Drakebaf71422001-02-19 16:31:02 +000055\begin{funcdesc}{get_close_matches}{word, possibilities\optional{,
56 n\optional{, cutoff}}}
57 Return a list of the best ``good enough'' matches. \var{word} is a
58 sequence for which close matches are desired (typically a string),
59 and \var{possibilities} is a list of sequences against which to
60 match \var{word} (typically a list of strings).
61
62 Optional argument \var{n} (default \code{3}) is the maximum number
63 of close matches to return; \var{n} must be greater than \code{0}.
64
65 Optional argument \var{cutoff} (default \code{0.6}) is a float in
66 the range [0, 1]. Possibilities that don't score at least that
67 similar to \var{word} are ignored.
68
69 The best (no more than \var{n}) matches among the possibilities are
70 returned in a list, sorted by similarity score, most similar first.
71
72\begin{verbatim}
73>>> get_close_matches('appel', ['ape', 'apple', 'peach', 'puppy'])
74['apple', 'ape']
75>>> import keyword
76>>> get_close_matches('wheel', keyword.kwlist)
77['while']
78>>> get_close_matches('apple', keyword.kwlist)
79[]
80>>> get_close_matches('accept', keyword.kwlist)
81['except']
82\end{verbatim}
83\end{funcdesc}
84
Fred Drake6943a292001-08-13 19:31:59 +000085\begin{funcdesc}{ndiff}{a, b\optional{, linejunk\optional{,
86 charjunk}}}
87 Compare \var{a} and \var{b} (lists of strings); return a
Tim Peters8a9c2842001-09-22 21:30:22 +000088 \class{Differ}-style delta (a generator generating the delta lines).
Fred Drakebaf71422001-02-19 16:31:02 +000089
Fred Drake6943a292001-08-13 19:31:59 +000090 Optional keyword parameters \var{linejunk} and \var{charjunk} are
91 for filter functions (or \code{None}):
92
93 \var{linejunk}: A function that should accept a single string
94 argument, and return true if the string is junk (or false if it is
95 not). The default is module-level function
96 \function{IS_LINE_JUNK()}, which filters out lines without visible
97 characters, except for at most one pound character (\character{\#}).
98
99 \var{charjunk}: A function that should accept a string of length 1.
100 The default is module-level function \function{IS_CHARACTER_JUNK()},
101 which filters out whitespace characters (a blank or tab; note: bad
102 idea to include newline in this!).
103
104 \file{Tools/scripts/ndiff.py} is a command-line front-end to this
105 function.
106
107\begin{verbatim}
108>>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),
109... 'ore\ntree\nemu\n'.splitlines(1)))
110>>> print ''.join(diff),
111- one
Tim Peters8a9c2842001-09-22 21:30:22 +0000112? ^
Fred Drake6943a292001-08-13 19:31:59 +0000113+ ore
Tim Peters8a9c2842001-09-22 21:30:22 +0000114? ^
Fred Drake6943a292001-08-13 19:31:59 +0000115- two
116- three
Tim Peters8a9c2842001-09-22 21:30:22 +0000117? -
Fred Drake6943a292001-08-13 19:31:59 +0000118+ tree
119+ emu
120\end{verbatim}
121\end{funcdesc}
122
123\begin{funcdesc}{restore}{sequence, which}
124 Return one of the two sequences that generated a delta.
125
126 Given a \var{sequence} produced by \method{Differ.compare()} or
127 \function{ndiff()}, extract lines originating from file 1 or 2
128 (parameter \var{which}), stripping off line prefixes.
129
130 Example:
131
132\begin{verbatim}
133>>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),
134... 'ore\ntree\nemu\n'.splitlines(1))
Tim Peters8a9c2842001-09-22 21:30:22 +0000135>>> diff = list(diff) # materialize the generated delta into a list
Fred Drake6943a292001-08-13 19:31:59 +0000136>>> print ''.join(restore(diff, 1)),
137one
138two
139three
140>>> print ''.join(restore(diff, 2)),
141ore
142tree
143emu
144\end{verbatim}
145
146\end{funcdesc}
147
148
149\begin{funcdesc}{IS_LINE_JUNK}{line}:
150
151 Return 1 for ignorable line: iff \var{line} is blank or contains a
152 single \character{\#}. Used as a default for parameter
153 \var{linejunk} in \function{ndiff()}.
154
155\end{funcdesc}
156
157
158\begin{funcdesc}{IS_CHARACTER_JUNK}{ch}:
159
160 Return 1 for ignorable character: iff \var{ch} is a space or tab.
161 Used as a default for parameter \var{charjunk} in
162 \function{ndiff()}.
163
164\end{funcdesc}
Fred Drakebaf71422001-02-19 16:31:02 +0000165
166
Fred Drake6fda3ac2001-04-10 18:41:16 +0000167\begin{seealso}
168 \seetitle{Pattern Matching: The Gestalt Approach}{Discussion of a
169 similar algorithm by John W. Ratcliff and D. E. Metzener.
170 This was published in
171 \citetitle[http://www.ddj.com/]{Dr. Dobb's Journal} in
172 July, 1988.}
173\end{seealso}
174
175
Fred Drakebaf71422001-02-19 16:31:02 +0000176\subsection{SequenceMatcher Objects \label{sequence-matcher}}
177
Fred Drake96d7a702001-05-11 01:08:13 +0000178The \class{SequenceMatcher} class has this constructor:
179
Fred Drakebaf71422001-02-19 16:31:02 +0000180\begin{classdesc}{SequenceMatcher}{\optional{isjunk\optional{,
181 a\optional{, b}}}}
182 Optional argument \var{isjunk} must be \code{None} (the default) or
183 a one-argument function that takes a sequence element and returns
184 true if and only if the element is ``junk'' and should be ignored.
185 \code{None} is equivalent to passing \code{lambda x: 0}, i.e.\ no
186 elements are ignored. For example, pass
187
188\begin{verbatim}
Fred Drake447f5452001-02-23 19:13:07 +0000189lambda x: x in " \t"
Fred Drakebaf71422001-02-19 16:31:02 +0000190\end{verbatim}
191
192 if you're comparing lines as sequences of characters, and don't want
193 to synch up on blanks or hard tabs.
194
195 The optional arguments \var{a} and \var{b} are sequences to be
196 compared; both default to empty strings. The elements of both
197 sequences must be hashable.
198\end{classdesc}
199
200
201\class{SequenceMatcher} objects have the following methods:
202
203\begin{methoddesc}{set_seqs}{a, b}
204 Set the two sequences to be compared.
205\end{methoddesc}
206
207\class{SequenceMatcher} computes and caches detailed information about
208the second sequence, so if you want to compare one sequence against
209many sequences, use \method{set_seq2()} to set the commonly used
210sequence once and call \method{set_seq1()} repeatedly, once for each
211of the other sequences.
212
213\begin{methoddesc}{set_seq1}{a}
214 Set the first sequence to be compared. The second sequence to be
215 compared is not changed.
216\end{methoddesc}
217
218\begin{methoddesc}{set_seq2}{b}
219 Set the second sequence to be compared. The first sequence to be
220 compared is not changed.
221\end{methoddesc}
222
223\begin{methoddesc}{find_longest_match}{alo, ahi, blo, bhi}
224 Find longest matching block in \code{\var{a}[\var{alo}:\var{ahi}]}
225 and \code{\var{b}[\var{blo}:\var{bhi}]}.
226
227 If \var{isjunk} was omitted or \code{None},
228 \method{get_longest_match()} returns \code{(\var{i}, \var{j},
229 \var{k})} such that \code{\var{a}[\var{i}:\var{i}+\var{k}]} is equal
Tim Peters8a9c2842001-09-22 21:30:22 +0000230 to \code{\var{b}[\var{j}:\var{j}+\var{k}]}, where
Fred Drakebaf71422001-02-19 16:31:02 +0000231 \code{\var{alo} <= \var{i} <= \var{i}+\var{k} <= \var{ahi}} and
232 \code{\var{blo} <= \var{j} <= \var{j}+\var{k} <= \var{bhi}}.
233 For all \code{(\var{i'}, \var{j'}, \var{k'})} meeting those
234 conditions, the additional conditions
235 \code{\var{k} >= \var{k'}},
236 \code{\var{i} <= \var{i'}},
237 and if \code{\var{i} == \var{i'}}, \code{\var{j} <= \var{j'}}
238 are also met.
239 In other words, of all maximal matching blocks, return one that
240 starts earliest in \var{a}, and of all those maximal matching blocks
241 that start earliest in \var{a}, return the one that starts earliest
242 in \var{b}.
243
244\begin{verbatim}
245>>> s = SequenceMatcher(None, " abcd", "abcd abcd")
246>>> s.find_longest_match(0, 5, 0, 9)
247(0, 4, 5)
248\end{verbatim}
249
250 If \var{isjunk} was provided, first the longest matching block is
251 determined as above, but with the additional restriction that no
252 junk element appears in the block. Then that block is extended as
253 far as possible by matching (only) junk elements on both sides.
254 So the resulting block never matches on junk except as identical
255 junk happens to be adjacent to an interesting match.
256
257 Here's the same example as before, but considering blanks to be junk.
Tim Peters754ba582001-02-20 11:24:35 +0000258 That prevents \code{' abcd'} from matching the \code{' abcd'} at the
Fred Drakebaf71422001-02-19 16:31:02 +0000259 tail end of the second sequence directly. Instead only the
260 \code{'abcd'} can match, and matches the leftmost \code{'abcd'} in
261 the second sequence:
262
263\begin{verbatim}
264>>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
265>>> s.find_longest_match(0, 5, 0, 9)
266(1, 0, 4)
267\end{verbatim}
268
269 If no blocks match, this returns \code{(\var{alo}, \var{blo}, 0)}.
270\end{methoddesc}
271
272\begin{methoddesc}{get_matching_blocks}{}
273 Return list of triples describing matching subsequences.
274 Each triple is of the form \code{(\var{i}, \var{j}, \var{n})}, and
275 means that \code{\var{a}[\var{i}:\var{i}+\var{n}] ==
276 \var{b}[\var{j}:\var{j}+\var{n}]}. The triples are monotonically
277 increasing in \var{i} and \var{j}.
278
279 The last triple is a dummy, and has the value \code{(len(\var{a}),
280 len(\var{b}), 0)}. It is the only triple with \code{\var{n} == 0}.
281 % Explain why a dummy is used!
282
283\begin{verbatim}
284>>> s = SequenceMatcher(None, "abxcd", "abcd")
285>>> s.get_matching_blocks()
286[(0, 0, 2), (3, 2, 2), (5, 4, 0)]
287\end{verbatim}
288\end{methoddesc}
289
290\begin{methoddesc}{get_opcodes}{}
291 Return list of 5-tuples describing how to turn \var{a} into \var{b}.
292 Each tuple is of the form \code{(\var{tag}, \var{i1}, \var{i2},
293 \var{j1}, \var{j2})}. The first tuple has \code{\var{i1} ==
294 \var{j1} == 0}, and remaining tuples have \var{i1} equal to the
295 \var{i2} from the preceeding tuple, and, likewise, \var{j1} equal to
296 the previous \var{j2}.
297
298 The \var{tag} values are strings, with these meanings:
299
300\begin{tableii}{l|l}{code}{Value}{Meaning}
301 \lineii{'replace'}{\code{\var{a}[\var{i1}:\var{i2}]} should be
302 replaced by \code{\var{b}[\var{j1}:\var{j2}]}.}
303 \lineii{'delete'}{\code{\var{a}[\var{i1}:\var{i2}]} should be
304 deleted. Note that \code{\var{j1} == \var{j2}} in
305 this case.}
306 \lineii{'insert'}{\code{\var{b}[\var{j1}:\var{j2}]} should be
Tim Peters8a9c2842001-09-22 21:30:22 +0000307 inserted at \code{\var{a}[\var{i1}:\var{i1}]}.
Fred Drakebaf71422001-02-19 16:31:02 +0000308 Note that \code{\var{i1} == \var{i2}} in this
309 case.}
310 \lineii{'equal'}{\code{\var{a}[\var{i1}:\var{i2}] ==
311 \var{b}[\var{j1}:\var{j2}]} (the sub-sequences are
312 equal).}
313\end{tableii}
314
315For example:
316
317\begin{verbatim}
318>>> a = "qabxcd"
319>>> b = "abycdf"
320>>> s = SequenceMatcher(None, a, b)
321>>> for tag, i1, i2, j1, j2 in s.get_opcodes():
322... print ("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
323... (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2]))
324 delete a[0:1] (q) b[0:0] ()
325 equal a[1:3] (ab) b[0:2] (ab)
326replace a[3:4] (x) b[2:3] (y)
327 equal a[4:6] (cd) b[3:5] (cd)
328 insert a[6:6] () b[5:6] (f)
329\end{verbatim}
330\end{methoddesc}
331
332\begin{methoddesc}{ratio}{}
333 Return a measure of the sequences' similarity as a float in the
334 range [0, 1].
335
336 Where T is the total number of elements in both sequences, and M is
Fred Drake6943a292001-08-13 19:31:59 +0000337 the number of matches, this is 2.0*M / T. Note that this is
338 \code{1.0} if the sequences are identical, and \code{0.0} if they
339 have nothing in common.
Fred Drakebaf71422001-02-19 16:31:02 +0000340
341 This is expensive to compute if \method{get_matching_blocks()} or
342 \method{get_opcodes()} hasn't already been called, in which case you
343 may want to try \method{quick_ratio()} or
344 \method{real_quick_ratio()} first to get an upper bound.
345\end{methoddesc}
346
347\begin{methoddesc}{quick_ratio}{}
348 Return an upper bound on \method{ratio()} relatively quickly.
349
350 This isn't defined beyond that it is an upper bound on
351 \method{ratio()}, and is faster to compute.
352\end{methoddesc}
353
354\begin{methoddesc}{real_quick_ratio}{}
355 Return an upper bound on \method{ratio()} very quickly.
356
357 This isn't defined beyond that it is an upper bound on
358 \method{ratio()}, and is faster to compute than either
359 \method{ratio()} or \method{quick_ratio()}.
360\end{methoddesc}
361
Tim Peters754ba582001-02-20 11:24:35 +0000362The three methods that return the ratio of matching to total characters
363can give different results due to differing levels of approximation,
364although \method{quick_ratio()} and \method{real_quick_ratio()} are always
365at least as large as \method{ratio()}:
Fred Drakebaf71422001-02-19 16:31:02 +0000366
367\begin{verbatim}
368>>> s = SequenceMatcher(None, "abcd", "bcde")
369>>> s.ratio()
3700.75
371>>> s.quick_ratio()
3720.75
373>>> s.real_quick_ratio()
3741.0
375\end{verbatim}
376
377
Fred Drake6943a292001-08-13 19:31:59 +0000378\subsection{SequenceMatcher Examples \label{sequencematcher-examples}}
Fred Drakebaf71422001-02-19 16:31:02 +0000379
380
381This example compares two strings, considering blanks to be ``junk:''
382
383\begin{verbatim}
384>>> s = SequenceMatcher(lambda x: x == " ",
385... "private Thread currentThread;",
386... "private volatile Thread currentThread;")
387\end{verbatim}
388
389\method{ratio()} returns a float in [0, 1], measuring the similarity
390of the sequences. As a rule of thumb, a \method{ratio()} value over
3910.6 means the sequences are close matches:
392
393\begin{verbatim}
394>>> print round(s.ratio(), 3)
3950.866
396\end{verbatim}
397
398If you're only interested in where the sequences match,
399\method{get_matching_blocks()} is handy:
400
401\begin{verbatim}
402>>> for block in s.get_matching_blocks():
403... print "a[%d] and b[%d] match for %d elements" % block
404a[0] and b[0] match for 8 elements
405a[8] and b[17] match for 6 elements
406a[14] and b[23] match for 15 elements
407a[29] and b[38] match for 0 elements
408\end{verbatim}
409
410Note that the last tuple returned by \method{get_matching_blocks()} is
411always a dummy, \code{(len(\var{a}), len(\var{b}), 0)}, and this is
412the only case in which the last tuple element (number of elements
413matched) is \code{0}.
414
415If you want to know how to change the first sequence into the second,
416use \method{get_opcodes()}:
417
418\begin{verbatim}
419>>> for opcode in s.get_opcodes():
420... print "%6s a[%d:%d] b[%d:%d]" % opcode
421 equal a[0:8] b[0:8]
422insert a[8:8] b[8:17]
423 equal a[8:14] b[17:23]
424 equal a[14:29] b[23:38]
425\end{verbatim}
426
Fred Drakebaf71422001-02-19 16:31:02 +0000427See also the function \function{get_close_matches()} in this module,
428which shows how simple code building on \class{SequenceMatcher} can be
429used to do useful work.
Fred Drake6943a292001-08-13 19:31:59 +0000430
431
432\subsection{Differ Objects \label{differ-objects}}
433
434Note that \class{Differ}-generated deltas make no claim to be
435\strong{minimal} diffs. To the contrary, minimal diffs are often
436counter-intuitive, because they synch up anywhere possible, sometimes
437accidental matches 100 pages apart. Restricting synch points to
438contiguous matches preserves some notion of locality, at the
439occasional cost of producing a longer diff.
440
441The \class{Differ} class has this constructor:
442
443\begin{classdesc}{Differ}{\optional{linejunk\optional{, charjunk}}}
444 Optional keyword parameters \var{linejunk} and \var{charjunk} are
445 for filter functions (or \code{None}):
446
447 \var{linejunk}: A function that should accept a single string
448 argument, and return true iff the string is junk. The default is
449 module-level function \function{IS_LINE_JUNK()}, which filters out
450 lines without visible characters, except for at most one pound
451 character (\character{\#}).
452
453 \var{charjunk}: A function that should accept a string of length 1.
454 The default is module-level function \function{IS_CHARACTER_JUNK()},
455 which filters out whitespace characters (a blank or tab; note: bad
456 idea to include newline in this!).
457\end{classdesc}
458
459\class{Differ} objects are used (deltas generated) via a single
460method:
461
462\begin{methoddesc}{compare}{a, b}
Tim Peters8a9c2842001-09-22 21:30:22 +0000463 Compare two sequences of lines, and generate the delta (a sequence
464 of lines).
Fred Drake6943a292001-08-13 19:31:59 +0000465
466 Each sequence must contain individual single-line strings ending
467 with newlines. Such sequences can be obtained from the
Tim Peters8a9c2842001-09-22 21:30:22 +0000468 \method{readlines()} method of file-like objects. The delta generated
469 also consists of newline-terminated strings, ready to be printed as-is
470 via the \method{writeline()} method of a file-like object.
Fred Drake6943a292001-08-13 19:31:59 +0000471\end{methoddesc}
472
473
474\subsection{Differ Example \label{differ-examples}}
475
476This example compares two texts. First we set up the texts, sequences
477of individual single-line strings ending with newlines (such sequences
478can also be obtained from the \method{readlines()} method of file-like
479objects):
480
481\begin{verbatim}
482>>> text1 = ''' 1. Beautiful is better than ugly.
483... 2. Explicit is better than implicit.
484... 3. Simple is better than complex.
485... 4. Complex is better than complicated.
486... '''.splitlines(1)
487>>> len(text1)
4884
489>>> text1[0][-1]
490'\n'
491>>> text2 = ''' 1. Beautiful is better than ugly.
492... 3. Simple is better than complex.
493... 4. Complicated is better than complex.
494... 5. Flat is better than nested.
495... '''.splitlines(1)
496\end{verbatim}
497
498Next we instantiate a Differ object:
499
500\begin{verbatim}
501>>> d = Differ()
502\end{verbatim}
503
504Note that when instantiating a \class{Differ} object we may pass
505functions to filter out line and character ``junk.'' See the
506\method{Differ()} constructor for details.
507
508Finally, we compare the two:
509
510\begin{verbatim}
Tim Peters8a9c2842001-09-22 21:30:22 +0000511>>> result = list(d.compare(text1, text2))
Fred Drake6943a292001-08-13 19:31:59 +0000512\end{verbatim}
513
514\code{result} is a list of strings, so let's pretty-print it:
515
516\begin{verbatim}
517>>> from pprint import pprint
518>>> pprint(result)
519[' 1. Beautiful is better than ugly.\n',
520 '- 2. Explicit is better than implicit.\n',
521 '- 3. Simple is better than complex.\n',
522 '+ 3. Simple is better than complex.\n',
523 '? ++ \n',
524 '- 4. Complex is better than complicated.\n',
525 '? ^ ---- ^ \n',
526 '+ 4. Complicated is better than complex.\n',
527 '? ++++ ^ ^ \n',
528 '+ 5. Flat is better than nested.\n']
529\end{verbatim}
530
531As a single multi-line string it looks like this:
532
533\begin{verbatim}
534>>> import sys
535>>> sys.stdout.writelines(result)
536 1. Beautiful is better than ugly.
537- 2. Explicit is better than implicit.
538- 3. Simple is better than complex.
539+ 3. Simple is better than complex.
540? ++
541- 4. Complex is better than complicated.
542? ^ ---- ^
543+ 4. Complicated is better than complex.
544? ++++ ^ ^
545+ 5. Flat is better than nested.
546\end{verbatim}