blob: a58ae8e78c20bee563443132745fc652081b5549 [file] [log] [blame]
Fred Drakebaf71422001-02-19 16:31:02 +00001\section{\module{difflib} ---
2 Helpers for computing deltas}
3
4\declaremodule{standard}{difflib}
5\modulesynopsis{Helpers for computing differences between objects.}
6\moduleauthor{Tim Peters}{tim.one@home.com}
7\sectionauthor{Tim Peters}{tim.one@home.com}
8% LaTeXification by Fred L. Drake, Jr. <fdrake@acm.org>.
9
10\begin{funcdesc}{get_close_matches}{word, possibilities\optional{,
11 n\optional{, cutoff}}}
12 Return a list of the best ``good enough'' matches. \var{word} is a
13 sequence for which close matches are desired (typically a string),
14 and \var{possibilities} is a list of sequences against which to
15 match \var{word} (typically a list of strings).
16
17 Optional argument \var{n} (default \code{3}) is the maximum number
18 of close matches to return; \var{n} must be greater than \code{0}.
19
20 Optional argument \var{cutoff} (default \code{0.6}) is a float in
21 the range [0, 1]. Possibilities that don't score at least that
22 similar to \var{word} are ignored.
23
24 The best (no more than \var{n}) matches among the possibilities are
25 returned in a list, sorted by similarity score, most similar first.
26
27\begin{verbatim}
28>>> get_close_matches('appel', ['ape', 'apple', 'peach', 'puppy'])
29['apple', 'ape']
30>>> import keyword
31>>> get_close_matches('wheel', keyword.kwlist)
32['while']
33>>> get_close_matches('apple', keyword.kwlist)
34[]
35>>> get_close_matches('accept', keyword.kwlist)
36['except']
37\end{verbatim}
38\end{funcdesc}
39
40\begin{classdesc}{SequenceMatcher}{\unspecified}
41 This is a flexible class for comparing pairs of sequences of any
42 type, so long as the sequence elements are hashable. The basic
43 algorithm predates, and is a little fancier than, an algorithm
44 published in the late 1980's by Ratcliff and Obershelp under the
45 hyperbolic name ``gestalt pattern matching.'' The idea is to find
46 the longest contiguous matching subsequence that contains no
47 ``junk'' elements (the Ratcliff and Obershelp algorithm doesn't
48 address junk). The same idea is then applied recursively to the
49 pieces of the sequences to the left and to the right of the matching
50 subsequence. This does not yield minimal edit sequences, but does
51 tend to yield matches that ``look right'' to people.
52
53 \strong{Timing:} The basic Ratcliff-Obershelp algorithm is cubic
54 time in the worst case and quadratic time in the expected case.
55 \class{SequenceMatcher} is quadratic time for the worst case and has
Tim Peters754ba582001-02-20 11:24:35 +000056 expected-case behavior dependent in a complicated way on how many
57 elements the sequences have in common; best case time is linear.
Fred Drakebaf71422001-02-19 16:31:02 +000058\end{classdesc}
59
60
61\subsection{SequenceMatcher Objects \label{sequence-matcher}}
62
63\begin{classdesc}{SequenceMatcher}{\optional{isjunk\optional{,
64 a\optional{, b}}}}
65 Optional argument \var{isjunk} must be \code{None} (the default) or
66 a one-argument function that takes a sequence element and returns
67 true if and only if the element is ``junk'' and should be ignored.
68 \code{None} is equivalent to passing \code{lambda x: 0}, i.e.\ no
69 elements are ignored. For example, pass
70
71\begin{verbatim}
Fred Drake447f5452001-02-23 19:13:07 +000072lambda x: x in " \t"
Fred Drakebaf71422001-02-19 16:31:02 +000073\end{verbatim}
74
75 if you're comparing lines as sequences of characters, and don't want
76 to synch up on blanks or hard tabs.
77
78 The optional arguments \var{a} and \var{b} are sequences to be
79 compared; both default to empty strings. The elements of both
80 sequences must be hashable.
81\end{classdesc}
82
83
84\class{SequenceMatcher} objects have the following methods:
85
86\begin{methoddesc}{set_seqs}{a, b}
87 Set the two sequences to be compared.
88\end{methoddesc}
89
90\class{SequenceMatcher} computes and caches detailed information about
91the second sequence, so if you want to compare one sequence against
92many sequences, use \method{set_seq2()} to set the commonly used
93sequence once and call \method{set_seq1()} repeatedly, once for each
94of the other sequences.
95
96\begin{methoddesc}{set_seq1}{a}
97 Set the first sequence to be compared. The second sequence to be
98 compared is not changed.
99\end{methoddesc}
100
101\begin{methoddesc}{set_seq2}{b}
102 Set the second sequence to be compared. The first sequence to be
103 compared is not changed.
104\end{methoddesc}
105
106\begin{methoddesc}{find_longest_match}{alo, ahi, blo, bhi}
107 Find longest matching block in \code{\var{a}[\var{alo}:\var{ahi}]}
108 and \code{\var{b}[\var{blo}:\var{bhi}]}.
109
110 If \var{isjunk} was omitted or \code{None},
111 \method{get_longest_match()} returns \code{(\var{i}, \var{j},
112 \var{k})} such that \code{\var{a}[\var{i}:\var{i}+\var{k}]} is equal
113 to \code{\var{b}[\var{j}:\var{j}+\var{k}]}, where
114 \code{\var{alo} <= \var{i} <= \var{i}+\var{k} <= \var{ahi}} and
115 \code{\var{blo} <= \var{j} <= \var{j}+\var{k} <= \var{bhi}}.
116 For all \code{(\var{i'}, \var{j'}, \var{k'})} meeting those
117 conditions, the additional conditions
118 \code{\var{k} >= \var{k'}},
119 \code{\var{i} <= \var{i'}},
120 and if \code{\var{i} == \var{i'}}, \code{\var{j} <= \var{j'}}
121 are also met.
122 In other words, of all maximal matching blocks, return one that
123 starts earliest in \var{a}, and of all those maximal matching blocks
124 that start earliest in \var{a}, return the one that starts earliest
125 in \var{b}.
126
127\begin{verbatim}
128>>> s = SequenceMatcher(None, " abcd", "abcd abcd")
129>>> s.find_longest_match(0, 5, 0, 9)
130(0, 4, 5)
131\end{verbatim}
132
133 If \var{isjunk} was provided, first the longest matching block is
134 determined as above, but with the additional restriction that no
135 junk element appears in the block. Then that block is extended as
136 far as possible by matching (only) junk elements on both sides.
137 So the resulting block never matches on junk except as identical
138 junk happens to be adjacent to an interesting match.
139
140 Here's the same example as before, but considering blanks to be junk.
Tim Peters754ba582001-02-20 11:24:35 +0000141 That prevents \code{' abcd'} from matching the \code{' abcd'} at the
Fred Drakebaf71422001-02-19 16:31:02 +0000142 tail end of the second sequence directly. Instead only the
143 \code{'abcd'} can match, and matches the leftmost \code{'abcd'} in
144 the second sequence:
145
146\begin{verbatim}
147>>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
148>>> s.find_longest_match(0, 5, 0, 9)
149(1, 0, 4)
150\end{verbatim}
151
152 If no blocks match, this returns \code{(\var{alo}, \var{blo}, 0)}.
153\end{methoddesc}
154
155\begin{methoddesc}{get_matching_blocks}{}
156 Return list of triples describing matching subsequences.
157 Each triple is of the form \code{(\var{i}, \var{j}, \var{n})}, and
158 means that \code{\var{a}[\var{i}:\var{i}+\var{n}] ==
159 \var{b}[\var{j}:\var{j}+\var{n}]}. The triples are monotonically
160 increasing in \var{i} and \var{j}.
161
162 The last triple is a dummy, and has the value \code{(len(\var{a}),
163 len(\var{b}), 0)}. It is the only triple with \code{\var{n} == 0}.
164 % Explain why a dummy is used!
165
166\begin{verbatim}
167>>> s = SequenceMatcher(None, "abxcd", "abcd")
168>>> s.get_matching_blocks()
169[(0, 0, 2), (3, 2, 2), (5, 4, 0)]
170\end{verbatim}
171\end{methoddesc}
172
173\begin{methoddesc}{get_opcodes}{}
174 Return list of 5-tuples describing how to turn \var{a} into \var{b}.
175 Each tuple is of the form \code{(\var{tag}, \var{i1}, \var{i2},
176 \var{j1}, \var{j2})}. The first tuple has \code{\var{i1} ==
177 \var{j1} == 0}, and remaining tuples have \var{i1} equal to the
178 \var{i2} from the preceeding tuple, and, likewise, \var{j1} equal to
179 the previous \var{j2}.
180
181 The \var{tag} values are strings, with these meanings:
182
183\begin{tableii}{l|l}{code}{Value}{Meaning}
184 \lineii{'replace'}{\code{\var{a}[\var{i1}:\var{i2}]} should be
185 replaced by \code{\var{b}[\var{j1}:\var{j2}]}.}
186 \lineii{'delete'}{\code{\var{a}[\var{i1}:\var{i2}]} should be
187 deleted. Note that \code{\var{j1} == \var{j2}} in
188 this case.}
189 \lineii{'insert'}{\code{\var{b}[\var{j1}:\var{j2}]} should be
190 inserted at \code{\var{a}[\var{i1}:\var{i1}]}.
191 Note that \code{\var{i1} == \var{i2}} in this
192 case.}
193 \lineii{'equal'}{\code{\var{a}[\var{i1}:\var{i2}] ==
194 \var{b}[\var{j1}:\var{j2}]} (the sub-sequences are
195 equal).}
196\end{tableii}
197
198For example:
199
200\begin{verbatim}
201>>> a = "qabxcd"
202>>> b = "abycdf"
203>>> s = SequenceMatcher(None, a, b)
204>>> for tag, i1, i2, j1, j2 in s.get_opcodes():
205... print ("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
206... (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2]))
207 delete a[0:1] (q) b[0:0] ()
208 equal a[1:3] (ab) b[0:2] (ab)
209replace a[3:4] (x) b[2:3] (y)
210 equal a[4:6] (cd) b[3:5] (cd)
211 insert a[6:6] () b[5:6] (f)
212\end{verbatim}
213\end{methoddesc}
214
215\begin{methoddesc}{ratio}{}
216 Return a measure of the sequences' similarity as a float in the
217 range [0, 1].
218
219 Where T is the total number of elements in both sequences, and M is
Tim Peters754ba582001-02-20 11:24:35 +0000220 the number of matches, this is 2.0*M / T. Note that this is \code{1.}
221 if the sequences are identical, and \code{0.} if they have nothing in
Fred Drakebaf71422001-02-19 16:31:02 +0000222 common.
223
224 This is expensive to compute if \method{get_matching_blocks()} or
225 \method{get_opcodes()} hasn't already been called, in which case you
226 may want to try \method{quick_ratio()} or
227 \method{real_quick_ratio()} first to get an upper bound.
228\end{methoddesc}
229
230\begin{methoddesc}{quick_ratio}{}
231 Return an upper bound on \method{ratio()} relatively quickly.
232
233 This isn't defined beyond that it is an upper bound on
234 \method{ratio()}, and is faster to compute.
235\end{methoddesc}
236
237\begin{methoddesc}{real_quick_ratio}{}
238 Return an upper bound on \method{ratio()} very quickly.
239
240 This isn't defined beyond that it is an upper bound on
241 \method{ratio()}, and is faster to compute than either
242 \method{ratio()} or \method{quick_ratio()}.
243\end{methoddesc}
244
Tim Peters754ba582001-02-20 11:24:35 +0000245The three methods that return the ratio of matching to total characters
246can give different results due to differing levels of approximation,
247although \method{quick_ratio()} and \method{real_quick_ratio()} are always
248at least as large as \method{ratio()}:
Fred Drakebaf71422001-02-19 16:31:02 +0000249
250\begin{verbatim}
251>>> s = SequenceMatcher(None, "abcd", "bcde")
252>>> s.ratio()
2530.75
254>>> s.quick_ratio()
2550.75
256>>> s.real_quick_ratio()
2571.0
258\end{verbatim}
259
260
261\subsection{Examples \label{difflib-examples}}
262
263
264This example compares two strings, considering blanks to be ``junk:''
265
266\begin{verbatim}
267>>> s = SequenceMatcher(lambda x: x == " ",
268... "private Thread currentThread;",
269... "private volatile Thread currentThread;")
270\end{verbatim}
271
272\method{ratio()} returns a float in [0, 1], measuring the similarity
273of the sequences. As a rule of thumb, a \method{ratio()} value over
2740.6 means the sequences are close matches:
275
276\begin{verbatim}
277>>> print round(s.ratio(), 3)
2780.866
279\end{verbatim}
280
281If you're only interested in where the sequences match,
282\method{get_matching_blocks()} is handy:
283
284\begin{verbatim}
285>>> for block in s.get_matching_blocks():
286... print "a[%d] and b[%d] match for %d elements" % block
287a[0] and b[0] match for 8 elements
288a[8] and b[17] match for 6 elements
289a[14] and b[23] match for 15 elements
290a[29] and b[38] match for 0 elements
291\end{verbatim}
292
293Note that the last tuple returned by \method{get_matching_blocks()} is
294always a dummy, \code{(len(\var{a}), len(\var{b}), 0)}, and this is
295the only case in which the last tuple element (number of elements
296matched) is \code{0}.
297
298If you want to know how to change the first sequence into the second,
299use \method{get_opcodes()}:
300
301\begin{verbatim}
302>>> for opcode in s.get_opcodes():
303... print "%6s a[%d:%d] b[%d:%d]" % opcode
304 equal a[0:8] b[0:8]
305insert a[8:8] b[8:17]
306 equal a[8:14] b[17:23]
307 equal a[14:29] b[23:38]
308\end{verbatim}
309
310See \file{Tools/scripts/ndiff.py} from the Python source distribution
311for a fancy human-friendly file differencer, which uses
312\class{SequenceMatcher} both to view files as sequences of lines, and
313lines as sequences of characters.
314
315See also the function \function{get_close_matches()} in this module,
316which shows how simple code building on \class{SequenceMatcher} can be
317used to do useful work.