blob: 0c35d6025e11e62b764c0332af7bfc127baa2450 [file] [log] [blame]
Fred Drakebaf71422001-02-19 16:31:02 +00001\section{\module{difflib} ---
2 Helpers for computing deltas}
3
4\declaremodule{standard}{difflib}
5\modulesynopsis{Helpers for computing differences between objects.}
6\moduleauthor{Tim Peters}{tim.one@home.com}
7\sectionauthor{Tim Peters}{tim.one@home.com}
8% LaTeXification by Fred L. Drake, Jr. <fdrake@acm.org>.
9
Fred Drakeda00cda2001-04-10 19:56:09 +000010\versionadded{2.1}
11
12
Fred Drakebaf71422001-02-19 16:31:02 +000013\begin{funcdesc}{get_close_matches}{word, possibilities\optional{,
14 n\optional{, cutoff}}}
15 Return a list of the best ``good enough'' matches. \var{word} is a
16 sequence for which close matches are desired (typically a string),
17 and \var{possibilities} is a list of sequences against which to
18 match \var{word} (typically a list of strings).
19
20 Optional argument \var{n} (default \code{3}) is the maximum number
21 of close matches to return; \var{n} must be greater than \code{0}.
22
23 Optional argument \var{cutoff} (default \code{0.6}) is a float in
24 the range [0, 1]. Possibilities that don't score at least that
25 similar to \var{word} are ignored.
26
27 The best (no more than \var{n}) matches among the possibilities are
28 returned in a list, sorted by similarity score, most similar first.
29
30\begin{verbatim}
31>>> get_close_matches('appel', ['ape', 'apple', 'peach', 'puppy'])
32['apple', 'ape']
33>>> import keyword
34>>> get_close_matches('wheel', keyword.kwlist)
35['while']
36>>> get_close_matches('apple', keyword.kwlist)
37[]
38>>> get_close_matches('accept', keyword.kwlist)
39['except']
40\end{verbatim}
41\end{funcdesc}
42
43\begin{classdesc}{SequenceMatcher}{\unspecified}
44 This is a flexible class for comparing pairs of sequences of any
45 type, so long as the sequence elements are hashable. The basic
46 algorithm predates, and is a little fancier than, an algorithm
47 published in the late 1980's by Ratcliff and Obershelp under the
48 hyperbolic name ``gestalt pattern matching.'' The idea is to find
49 the longest contiguous matching subsequence that contains no
50 ``junk'' elements (the Ratcliff and Obershelp algorithm doesn't
51 address junk). The same idea is then applied recursively to the
52 pieces of the sequences to the left and to the right of the matching
53 subsequence. This does not yield minimal edit sequences, but does
54 tend to yield matches that ``look right'' to people.
55
56 \strong{Timing:} The basic Ratcliff-Obershelp algorithm is cubic
57 time in the worst case and quadratic time in the expected case.
58 \class{SequenceMatcher} is quadratic time for the worst case and has
Tim Peters754ba582001-02-20 11:24:35 +000059 expected-case behavior dependent in a complicated way on how many
60 elements the sequences have in common; best case time is linear.
Fred Drakebaf71422001-02-19 16:31:02 +000061\end{classdesc}
62
63
Fred Drake6fda3ac2001-04-10 18:41:16 +000064\begin{seealso}
65 \seetitle{Pattern Matching: The Gestalt Approach}{Discussion of a
66 similar algorithm by John W. Ratcliff and D. E. Metzener.
67 This was published in
68 \citetitle[http://www.ddj.com/]{Dr. Dobb's Journal} in
69 July, 1988.}
70\end{seealso}
71
72
Fred Drakebaf71422001-02-19 16:31:02 +000073\subsection{SequenceMatcher Objects \label{sequence-matcher}}
74
75\begin{classdesc}{SequenceMatcher}{\optional{isjunk\optional{,
76 a\optional{, b}}}}
77 Optional argument \var{isjunk} must be \code{None} (the default) or
78 a one-argument function that takes a sequence element and returns
79 true if and only if the element is ``junk'' and should be ignored.
80 \code{None} is equivalent to passing \code{lambda x: 0}, i.e.\ no
81 elements are ignored. For example, pass
82
83\begin{verbatim}
Fred Drake447f5452001-02-23 19:13:07 +000084lambda x: x in " \t"
Fred Drakebaf71422001-02-19 16:31:02 +000085\end{verbatim}
86
87 if you're comparing lines as sequences of characters, and don't want
88 to synch up on blanks or hard tabs.
89
90 The optional arguments \var{a} and \var{b} are sequences to be
91 compared; both default to empty strings. The elements of both
92 sequences must be hashable.
93\end{classdesc}
94
95
96\class{SequenceMatcher} objects have the following methods:
97
98\begin{methoddesc}{set_seqs}{a, b}
99 Set the two sequences to be compared.
100\end{methoddesc}
101
102\class{SequenceMatcher} computes and caches detailed information about
103the second sequence, so if you want to compare one sequence against
104many sequences, use \method{set_seq2()} to set the commonly used
105sequence once and call \method{set_seq1()} repeatedly, once for each
106of the other sequences.
107
108\begin{methoddesc}{set_seq1}{a}
109 Set the first sequence to be compared. The second sequence to be
110 compared is not changed.
111\end{methoddesc}
112
113\begin{methoddesc}{set_seq2}{b}
114 Set the second sequence to be compared. The first sequence to be
115 compared is not changed.
116\end{methoddesc}
117
118\begin{methoddesc}{find_longest_match}{alo, ahi, blo, bhi}
119 Find longest matching block in \code{\var{a}[\var{alo}:\var{ahi}]}
120 and \code{\var{b}[\var{blo}:\var{bhi}]}.
121
122 If \var{isjunk} was omitted or \code{None},
123 \method{get_longest_match()} returns \code{(\var{i}, \var{j},
124 \var{k})} such that \code{\var{a}[\var{i}:\var{i}+\var{k}]} is equal
125 to \code{\var{b}[\var{j}:\var{j}+\var{k}]}, where
126 \code{\var{alo} <= \var{i} <= \var{i}+\var{k} <= \var{ahi}} and
127 \code{\var{blo} <= \var{j} <= \var{j}+\var{k} <= \var{bhi}}.
128 For all \code{(\var{i'}, \var{j'}, \var{k'})} meeting those
129 conditions, the additional conditions
130 \code{\var{k} >= \var{k'}},
131 \code{\var{i} <= \var{i'}},
132 and if \code{\var{i} == \var{i'}}, \code{\var{j} <= \var{j'}}
133 are also met.
134 In other words, of all maximal matching blocks, return one that
135 starts earliest in \var{a}, and of all those maximal matching blocks
136 that start earliest in \var{a}, return the one that starts earliest
137 in \var{b}.
138
139\begin{verbatim}
140>>> s = SequenceMatcher(None, " abcd", "abcd abcd")
141>>> s.find_longest_match(0, 5, 0, 9)
142(0, 4, 5)
143\end{verbatim}
144
145 If \var{isjunk} was provided, first the longest matching block is
146 determined as above, but with the additional restriction that no
147 junk element appears in the block. Then that block is extended as
148 far as possible by matching (only) junk elements on both sides.
149 So the resulting block never matches on junk except as identical
150 junk happens to be adjacent to an interesting match.
151
152 Here's the same example as before, but considering blanks to be junk.
Tim Peters754ba582001-02-20 11:24:35 +0000153 That prevents \code{' abcd'} from matching the \code{' abcd'} at the
Fred Drakebaf71422001-02-19 16:31:02 +0000154 tail end of the second sequence directly. Instead only the
155 \code{'abcd'} can match, and matches the leftmost \code{'abcd'} in
156 the second sequence:
157
158\begin{verbatim}
159>>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
160>>> s.find_longest_match(0, 5, 0, 9)
161(1, 0, 4)
162\end{verbatim}
163
164 If no blocks match, this returns \code{(\var{alo}, \var{blo}, 0)}.
165\end{methoddesc}
166
167\begin{methoddesc}{get_matching_blocks}{}
168 Return list of triples describing matching subsequences.
169 Each triple is of the form \code{(\var{i}, \var{j}, \var{n})}, and
170 means that \code{\var{a}[\var{i}:\var{i}+\var{n}] ==
171 \var{b}[\var{j}:\var{j}+\var{n}]}. The triples are monotonically
172 increasing in \var{i} and \var{j}.
173
174 The last triple is a dummy, and has the value \code{(len(\var{a}),
175 len(\var{b}), 0)}. It is the only triple with \code{\var{n} == 0}.
176 % Explain why a dummy is used!
177
178\begin{verbatim}
179>>> s = SequenceMatcher(None, "abxcd", "abcd")
180>>> s.get_matching_blocks()
181[(0, 0, 2), (3, 2, 2), (5, 4, 0)]
182\end{verbatim}
183\end{methoddesc}
184
185\begin{methoddesc}{get_opcodes}{}
186 Return list of 5-tuples describing how to turn \var{a} into \var{b}.
187 Each tuple is of the form \code{(\var{tag}, \var{i1}, \var{i2},
188 \var{j1}, \var{j2})}. The first tuple has \code{\var{i1} ==
189 \var{j1} == 0}, and remaining tuples have \var{i1} equal to the
190 \var{i2} from the preceeding tuple, and, likewise, \var{j1} equal to
191 the previous \var{j2}.
192
193 The \var{tag} values are strings, with these meanings:
194
195\begin{tableii}{l|l}{code}{Value}{Meaning}
196 \lineii{'replace'}{\code{\var{a}[\var{i1}:\var{i2}]} should be
197 replaced by \code{\var{b}[\var{j1}:\var{j2}]}.}
198 \lineii{'delete'}{\code{\var{a}[\var{i1}:\var{i2}]} should be
199 deleted. Note that \code{\var{j1} == \var{j2}} in
200 this case.}
201 \lineii{'insert'}{\code{\var{b}[\var{j1}:\var{j2}]} should be
202 inserted at \code{\var{a}[\var{i1}:\var{i1}]}.
203 Note that \code{\var{i1} == \var{i2}} in this
204 case.}
205 \lineii{'equal'}{\code{\var{a}[\var{i1}:\var{i2}] ==
206 \var{b}[\var{j1}:\var{j2}]} (the sub-sequences are
207 equal).}
208\end{tableii}
209
210For example:
211
212\begin{verbatim}
213>>> a = "qabxcd"
214>>> b = "abycdf"
215>>> s = SequenceMatcher(None, a, b)
216>>> for tag, i1, i2, j1, j2 in s.get_opcodes():
217... print ("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
218... (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2]))
219 delete a[0:1] (q) b[0:0] ()
220 equal a[1:3] (ab) b[0:2] (ab)
221replace a[3:4] (x) b[2:3] (y)
222 equal a[4:6] (cd) b[3:5] (cd)
223 insert a[6:6] () b[5:6] (f)
224\end{verbatim}
225\end{methoddesc}
226
227\begin{methoddesc}{ratio}{}
228 Return a measure of the sequences' similarity as a float in the
229 range [0, 1].
230
231 Where T is the total number of elements in both sequences, and M is
Tim Peters754ba582001-02-20 11:24:35 +0000232 the number of matches, this is 2.0*M / T. Note that this is \code{1.}
233 if the sequences are identical, and \code{0.} if they have nothing in
Fred Drakebaf71422001-02-19 16:31:02 +0000234 common.
235
236 This is expensive to compute if \method{get_matching_blocks()} or
237 \method{get_opcodes()} hasn't already been called, in which case you
238 may want to try \method{quick_ratio()} or
239 \method{real_quick_ratio()} first to get an upper bound.
240\end{methoddesc}
241
242\begin{methoddesc}{quick_ratio}{}
243 Return an upper bound on \method{ratio()} relatively quickly.
244
245 This isn't defined beyond that it is an upper bound on
246 \method{ratio()}, and is faster to compute.
247\end{methoddesc}
248
249\begin{methoddesc}{real_quick_ratio}{}
250 Return an upper bound on \method{ratio()} very quickly.
251
252 This isn't defined beyond that it is an upper bound on
253 \method{ratio()}, and is faster to compute than either
254 \method{ratio()} or \method{quick_ratio()}.
255\end{methoddesc}
256
Tim Peters754ba582001-02-20 11:24:35 +0000257The three methods that return the ratio of matching to total characters
258can give different results due to differing levels of approximation,
259although \method{quick_ratio()} and \method{real_quick_ratio()} are always
260at least as large as \method{ratio()}:
Fred Drakebaf71422001-02-19 16:31:02 +0000261
262\begin{verbatim}
263>>> s = SequenceMatcher(None, "abcd", "bcde")
264>>> s.ratio()
2650.75
266>>> s.quick_ratio()
2670.75
268>>> s.real_quick_ratio()
2691.0
270\end{verbatim}
271
272
273\subsection{Examples \label{difflib-examples}}
274
275
276This example compares two strings, considering blanks to be ``junk:''
277
278\begin{verbatim}
279>>> s = SequenceMatcher(lambda x: x == " ",
280... "private Thread currentThread;",
281... "private volatile Thread currentThread;")
282\end{verbatim}
283
284\method{ratio()} returns a float in [0, 1], measuring the similarity
285of the sequences. As a rule of thumb, a \method{ratio()} value over
2860.6 means the sequences are close matches:
287
288\begin{verbatim}
289>>> print round(s.ratio(), 3)
2900.866
291\end{verbatim}
292
293If you're only interested in where the sequences match,
294\method{get_matching_blocks()} is handy:
295
296\begin{verbatim}
297>>> for block in s.get_matching_blocks():
298... print "a[%d] and b[%d] match for %d elements" % block
299a[0] and b[0] match for 8 elements
300a[8] and b[17] match for 6 elements
301a[14] and b[23] match for 15 elements
302a[29] and b[38] match for 0 elements
303\end{verbatim}
304
305Note that the last tuple returned by \method{get_matching_blocks()} is
306always a dummy, \code{(len(\var{a}), len(\var{b}), 0)}, and this is
307the only case in which the last tuple element (number of elements
308matched) is \code{0}.
309
310If you want to know how to change the first sequence into the second,
311use \method{get_opcodes()}:
312
313\begin{verbatim}
314>>> for opcode in s.get_opcodes():
315... print "%6s a[%d:%d] b[%d:%d]" % opcode
316 equal a[0:8] b[0:8]
317insert a[8:8] b[8:17]
318 equal a[8:14] b[17:23]
319 equal a[14:29] b[23:38]
320\end{verbatim}
321
322See \file{Tools/scripts/ndiff.py} from the Python source distribution
323for a fancy human-friendly file differencer, which uses
324\class{SequenceMatcher} both to view files as sequences of lines, and
325lines as sequences of characters.
326
327See also the function \function{get_close_matches()} in this module,
328which shows how simple code building on \class{SequenceMatcher} can be
329used to do useful work.