blob: ce191c0231720426fdf46f61d69586bd1f6a2b85 [file] [log] [blame]
Fred Drakebaf71422001-02-19 16:31:02 +00001\section{\module{difflib} ---
2 Helpers for computing deltas}
3
4\declaremodule{standard}{difflib}
5\modulesynopsis{Helpers for computing differences between objects.}
6\moduleauthor{Tim Peters}{tim.one@home.com}
7\sectionauthor{Tim Peters}{tim.one@home.com}
8% LaTeXification by Fred L. Drake, Jr. <fdrake@acm.org>.
9
Fred Drakeda00cda2001-04-10 19:56:09 +000010\versionadded{2.1}
11
12
Fred Drakebaf71422001-02-19 16:31:02 +000013\begin{funcdesc}{get_close_matches}{word, possibilities\optional{,
14 n\optional{, cutoff}}}
15 Return a list of the best ``good enough'' matches. \var{word} is a
16 sequence for which close matches are desired (typically a string),
17 and \var{possibilities} is a list of sequences against which to
18 match \var{word} (typically a list of strings).
19
20 Optional argument \var{n} (default \code{3}) is the maximum number
21 of close matches to return; \var{n} must be greater than \code{0}.
22
23 Optional argument \var{cutoff} (default \code{0.6}) is a float in
24 the range [0, 1]. Possibilities that don't score at least that
25 similar to \var{word} are ignored.
26
27 The best (no more than \var{n}) matches among the possibilities are
28 returned in a list, sorted by similarity score, most similar first.
29
30\begin{verbatim}
31>>> get_close_matches('appel', ['ape', 'apple', 'peach', 'puppy'])
32['apple', 'ape']
33>>> import keyword
34>>> get_close_matches('wheel', keyword.kwlist)
35['while']
36>>> get_close_matches('apple', keyword.kwlist)
37[]
38>>> get_close_matches('accept', keyword.kwlist)
39['except']
40\end{verbatim}
41\end{funcdesc}
42
Fred Drake96d7a702001-05-11 01:08:13 +000043\begin{classdesc*}{SequenceMatcher}
Fred Drakebaf71422001-02-19 16:31:02 +000044 This is a flexible class for comparing pairs of sequences of any
45 type, so long as the sequence elements are hashable. The basic
46 algorithm predates, and is a little fancier than, an algorithm
47 published in the late 1980's by Ratcliff and Obershelp under the
48 hyperbolic name ``gestalt pattern matching.'' The idea is to find
49 the longest contiguous matching subsequence that contains no
50 ``junk'' elements (the Ratcliff and Obershelp algorithm doesn't
51 address junk). The same idea is then applied recursively to the
52 pieces of the sequences to the left and to the right of the matching
53 subsequence. This does not yield minimal edit sequences, but does
54 tend to yield matches that ``look right'' to people.
55
56 \strong{Timing:} The basic Ratcliff-Obershelp algorithm is cubic
57 time in the worst case and quadratic time in the expected case.
58 \class{SequenceMatcher} is quadratic time for the worst case and has
Tim Peters754ba582001-02-20 11:24:35 +000059 expected-case behavior dependent in a complicated way on how many
60 elements the sequences have in common; best case time is linear.
Fred Drake886f1132001-05-11 15:49:19 +000061\end{classdesc*}
Fred Drakebaf71422001-02-19 16:31:02 +000062
63
Fred Drake6fda3ac2001-04-10 18:41:16 +000064\begin{seealso}
65 \seetitle{Pattern Matching: The Gestalt Approach}{Discussion of a
66 similar algorithm by John W. Ratcliff and D. E. Metzener.
67 This was published in
68 \citetitle[http://www.ddj.com/]{Dr. Dobb's Journal} in
69 July, 1988.}
70\end{seealso}
71
72
Fred Drakebaf71422001-02-19 16:31:02 +000073\subsection{SequenceMatcher Objects \label{sequence-matcher}}
74
Fred Drake96d7a702001-05-11 01:08:13 +000075The \class{SequenceMatcher} class has this constructor:
76
Fred Drakebaf71422001-02-19 16:31:02 +000077\begin{classdesc}{SequenceMatcher}{\optional{isjunk\optional{,
78 a\optional{, b}}}}
79 Optional argument \var{isjunk} must be \code{None} (the default) or
80 a one-argument function that takes a sequence element and returns
81 true if and only if the element is ``junk'' and should be ignored.
82 \code{None} is equivalent to passing \code{lambda x: 0}, i.e.\ no
83 elements are ignored. For example, pass
84
85\begin{verbatim}
Fred Drake447f5452001-02-23 19:13:07 +000086lambda x: x in " \t"
Fred Drakebaf71422001-02-19 16:31:02 +000087\end{verbatim}
88
89 if you're comparing lines as sequences of characters, and don't want
90 to synch up on blanks or hard tabs.
91
92 The optional arguments \var{a} and \var{b} are sequences to be
93 compared; both default to empty strings. The elements of both
94 sequences must be hashable.
95\end{classdesc}
96
97
98\class{SequenceMatcher} objects have the following methods:
99
100\begin{methoddesc}{set_seqs}{a, b}
101 Set the two sequences to be compared.
102\end{methoddesc}
103
104\class{SequenceMatcher} computes and caches detailed information about
105the second sequence, so if you want to compare one sequence against
106many sequences, use \method{set_seq2()} to set the commonly used
107sequence once and call \method{set_seq1()} repeatedly, once for each
108of the other sequences.
109
110\begin{methoddesc}{set_seq1}{a}
111 Set the first sequence to be compared. The second sequence to be
112 compared is not changed.
113\end{methoddesc}
114
115\begin{methoddesc}{set_seq2}{b}
116 Set the second sequence to be compared. The first sequence to be
117 compared is not changed.
118\end{methoddesc}
119
120\begin{methoddesc}{find_longest_match}{alo, ahi, blo, bhi}
121 Find longest matching block in \code{\var{a}[\var{alo}:\var{ahi}]}
122 and \code{\var{b}[\var{blo}:\var{bhi}]}.
123
124 If \var{isjunk} was omitted or \code{None},
125 \method{get_longest_match()} returns \code{(\var{i}, \var{j},
126 \var{k})} such that \code{\var{a}[\var{i}:\var{i}+\var{k}]} is equal
127 to \code{\var{b}[\var{j}:\var{j}+\var{k}]}, where
128 \code{\var{alo} <= \var{i} <= \var{i}+\var{k} <= \var{ahi}} and
129 \code{\var{blo} <= \var{j} <= \var{j}+\var{k} <= \var{bhi}}.
130 For all \code{(\var{i'}, \var{j'}, \var{k'})} meeting those
131 conditions, the additional conditions
132 \code{\var{k} >= \var{k'}},
133 \code{\var{i} <= \var{i'}},
134 and if \code{\var{i} == \var{i'}}, \code{\var{j} <= \var{j'}}
135 are also met.
136 In other words, of all maximal matching blocks, return one that
137 starts earliest in \var{a}, and of all those maximal matching blocks
138 that start earliest in \var{a}, return the one that starts earliest
139 in \var{b}.
140
141\begin{verbatim}
142>>> s = SequenceMatcher(None, " abcd", "abcd abcd")
143>>> s.find_longest_match(0, 5, 0, 9)
144(0, 4, 5)
145\end{verbatim}
146
147 If \var{isjunk} was provided, first the longest matching block is
148 determined as above, but with the additional restriction that no
149 junk element appears in the block. Then that block is extended as
150 far as possible by matching (only) junk elements on both sides.
151 So the resulting block never matches on junk except as identical
152 junk happens to be adjacent to an interesting match.
153
154 Here's the same example as before, but considering blanks to be junk.
Tim Peters754ba582001-02-20 11:24:35 +0000155 That prevents \code{' abcd'} from matching the \code{' abcd'} at the
Fred Drakebaf71422001-02-19 16:31:02 +0000156 tail end of the second sequence directly. Instead only the
157 \code{'abcd'} can match, and matches the leftmost \code{'abcd'} in
158 the second sequence:
159
160\begin{verbatim}
161>>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
162>>> s.find_longest_match(0, 5, 0, 9)
163(1, 0, 4)
164\end{verbatim}
165
166 If no blocks match, this returns \code{(\var{alo}, \var{blo}, 0)}.
167\end{methoddesc}
168
169\begin{methoddesc}{get_matching_blocks}{}
170 Return list of triples describing matching subsequences.
171 Each triple is of the form \code{(\var{i}, \var{j}, \var{n})}, and
172 means that \code{\var{a}[\var{i}:\var{i}+\var{n}] ==
173 \var{b}[\var{j}:\var{j}+\var{n}]}. The triples are monotonically
174 increasing in \var{i} and \var{j}.
175
176 The last triple is a dummy, and has the value \code{(len(\var{a}),
177 len(\var{b}), 0)}. It is the only triple with \code{\var{n} == 0}.
178 % Explain why a dummy is used!
179
180\begin{verbatim}
181>>> s = SequenceMatcher(None, "abxcd", "abcd")
182>>> s.get_matching_blocks()
183[(0, 0, 2), (3, 2, 2), (5, 4, 0)]
184\end{verbatim}
185\end{methoddesc}
186
187\begin{methoddesc}{get_opcodes}{}
188 Return list of 5-tuples describing how to turn \var{a} into \var{b}.
189 Each tuple is of the form \code{(\var{tag}, \var{i1}, \var{i2},
190 \var{j1}, \var{j2})}. The first tuple has \code{\var{i1} ==
191 \var{j1} == 0}, and remaining tuples have \var{i1} equal to the
192 \var{i2} from the preceeding tuple, and, likewise, \var{j1} equal to
193 the previous \var{j2}.
194
195 The \var{tag} values are strings, with these meanings:
196
197\begin{tableii}{l|l}{code}{Value}{Meaning}
198 \lineii{'replace'}{\code{\var{a}[\var{i1}:\var{i2}]} should be
199 replaced by \code{\var{b}[\var{j1}:\var{j2}]}.}
200 \lineii{'delete'}{\code{\var{a}[\var{i1}:\var{i2}]} should be
201 deleted. Note that \code{\var{j1} == \var{j2}} in
202 this case.}
203 \lineii{'insert'}{\code{\var{b}[\var{j1}:\var{j2}]} should be
204 inserted at \code{\var{a}[\var{i1}:\var{i1}]}.
205 Note that \code{\var{i1} == \var{i2}} in this
206 case.}
207 \lineii{'equal'}{\code{\var{a}[\var{i1}:\var{i2}] ==
208 \var{b}[\var{j1}:\var{j2}]} (the sub-sequences are
209 equal).}
210\end{tableii}
211
212For example:
213
214\begin{verbatim}
215>>> a = "qabxcd"
216>>> b = "abycdf"
217>>> s = SequenceMatcher(None, a, b)
218>>> for tag, i1, i2, j1, j2 in s.get_opcodes():
219... print ("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
220... (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2]))
221 delete a[0:1] (q) b[0:0] ()
222 equal a[1:3] (ab) b[0:2] (ab)
223replace a[3:4] (x) b[2:3] (y)
224 equal a[4:6] (cd) b[3:5] (cd)
225 insert a[6:6] () b[5:6] (f)
226\end{verbatim}
227\end{methoddesc}
228
229\begin{methoddesc}{ratio}{}
230 Return a measure of the sequences' similarity as a float in the
231 range [0, 1].
232
233 Where T is the total number of elements in both sequences, and M is
Tim Peters754ba582001-02-20 11:24:35 +0000234 the number of matches, this is 2.0*M / T. Note that this is \code{1.}
235 if the sequences are identical, and \code{0.} if they have nothing in
Fred Drakebaf71422001-02-19 16:31:02 +0000236 common.
237
238 This is expensive to compute if \method{get_matching_blocks()} or
239 \method{get_opcodes()} hasn't already been called, in which case you
240 may want to try \method{quick_ratio()} or
241 \method{real_quick_ratio()} first to get an upper bound.
242\end{methoddesc}
243
244\begin{methoddesc}{quick_ratio}{}
245 Return an upper bound on \method{ratio()} relatively quickly.
246
247 This isn't defined beyond that it is an upper bound on
248 \method{ratio()}, and is faster to compute.
249\end{methoddesc}
250
251\begin{methoddesc}{real_quick_ratio}{}
252 Return an upper bound on \method{ratio()} very quickly.
253
254 This isn't defined beyond that it is an upper bound on
255 \method{ratio()}, and is faster to compute than either
256 \method{ratio()} or \method{quick_ratio()}.
257\end{methoddesc}
258
Tim Peters754ba582001-02-20 11:24:35 +0000259The three methods that return the ratio of matching to total characters
260can give different results due to differing levels of approximation,
261although \method{quick_ratio()} and \method{real_quick_ratio()} are always
262at least as large as \method{ratio()}:
Fred Drakebaf71422001-02-19 16:31:02 +0000263
264\begin{verbatim}
265>>> s = SequenceMatcher(None, "abcd", "bcde")
266>>> s.ratio()
2670.75
268>>> s.quick_ratio()
2690.75
270>>> s.real_quick_ratio()
2711.0
272\end{verbatim}
273
274
275\subsection{Examples \label{difflib-examples}}
276
277
278This example compares two strings, considering blanks to be ``junk:''
279
280\begin{verbatim}
281>>> s = SequenceMatcher(lambda x: x == " ",
282... "private Thread currentThread;",
283... "private volatile Thread currentThread;")
284\end{verbatim}
285
286\method{ratio()} returns a float in [0, 1], measuring the similarity
287of the sequences. As a rule of thumb, a \method{ratio()} value over
2880.6 means the sequences are close matches:
289
290\begin{verbatim}
291>>> print round(s.ratio(), 3)
2920.866
293\end{verbatim}
294
295If you're only interested in where the sequences match,
296\method{get_matching_blocks()} is handy:
297
298\begin{verbatim}
299>>> for block in s.get_matching_blocks():
300... print "a[%d] and b[%d] match for %d elements" % block
301a[0] and b[0] match for 8 elements
302a[8] and b[17] match for 6 elements
303a[14] and b[23] match for 15 elements
304a[29] and b[38] match for 0 elements
305\end{verbatim}
306
307Note that the last tuple returned by \method{get_matching_blocks()} is
308always a dummy, \code{(len(\var{a}), len(\var{b}), 0)}, and this is
309the only case in which the last tuple element (number of elements
310matched) is \code{0}.
311
312If you want to know how to change the first sequence into the second,
313use \method{get_opcodes()}:
314
315\begin{verbatim}
316>>> for opcode in s.get_opcodes():
317... print "%6s a[%d:%d] b[%d:%d]" % opcode
318 equal a[0:8] b[0:8]
319insert a[8:8] b[8:17]
320 equal a[8:14] b[17:23]
321 equal a[14:29] b[23:38]
322\end{verbatim}
323
324See \file{Tools/scripts/ndiff.py} from the Python source distribution
325for a fancy human-friendly file differencer, which uses
326\class{SequenceMatcher} both to view files as sequences of lines, and
327lines as sequences of characters.
328
329See also the function \function{get_close_matches()} in this module,
330which shows how simple code building on \class{SequenceMatcher} can be
331used to do useful work.