blob: 128552bc1e0f28b215eba4cfe0f6536959c054d0 [file] [log] [blame]
Fred Drakebaf71422001-02-19 16:31:02 +00001\section{\module{difflib} ---
2 Helpers for computing deltas}
3
4\declaremodule{standard}{difflib}
5\modulesynopsis{Helpers for computing differences between objects.}
6\moduleauthor{Tim Peters}{tim.one@home.com}
7\sectionauthor{Tim Peters}{tim.one@home.com}
8% LaTeXification by Fred L. Drake, Jr. <fdrake@acm.org>.
9
10\begin{funcdesc}{get_close_matches}{word, possibilities\optional{,
11 n\optional{, cutoff}}}
12 Return a list of the best ``good enough'' matches. \var{word} is a
13 sequence for which close matches are desired (typically a string),
14 and \var{possibilities} is a list of sequences against which to
15 match \var{word} (typically a list of strings).
16
17 Optional argument \var{n} (default \code{3}) is the maximum number
18 of close matches to return; \var{n} must be greater than \code{0}.
19
20 Optional argument \var{cutoff} (default \code{0.6}) is a float in
21 the range [0, 1]. Possibilities that don't score at least that
22 similar to \var{word} are ignored.
23
24 The best (no more than \var{n}) matches among the possibilities are
25 returned in a list, sorted by similarity score, most similar first.
26
27\begin{verbatim}
28>>> get_close_matches('appel', ['ape', 'apple', 'peach', 'puppy'])
29['apple', 'ape']
30>>> import keyword
31>>> get_close_matches('wheel', keyword.kwlist)
32['while']
33>>> get_close_matches('apple', keyword.kwlist)
34[]
35>>> get_close_matches('accept', keyword.kwlist)
36['except']
37\end{verbatim}
38\end{funcdesc}
39
40\begin{classdesc}{SequenceMatcher}{\unspecified}
41 This is a flexible class for comparing pairs of sequences of any
42 type, so long as the sequence elements are hashable. The basic
43 algorithm predates, and is a little fancier than, an algorithm
44 published in the late 1980's by Ratcliff and Obershelp under the
45 hyperbolic name ``gestalt pattern matching.'' The idea is to find
46 the longest contiguous matching subsequence that contains no
47 ``junk'' elements (the Ratcliff and Obershelp algorithm doesn't
48 address junk). The same idea is then applied recursively to the
49 pieces of the sequences to the left and to the right of the matching
50 subsequence. This does not yield minimal edit sequences, but does
51 tend to yield matches that ``look right'' to people.
52
53 \strong{Timing:} The basic Ratcliff-Obershelp algorithm is cubic
54 time in the worst case and quadratic time in the expected case.
55 \class{SequenceMatcher} is quadratic time for the worst case and has
Tim Peters754ba582001-02-20 11:24:35 +000056 expected-case behavior dependent in a complicated way on how many
57 elements the sequences have in common; best case time is linear.
Fred Drakebaf71422001-02-19 16:31:02 +000058\end{classdesc}
59
60
61\subsection{SequenceMatcher Objects \label{sequence-matcher}}
62
63\begin{classdesc}{SequenceMatcher}{\optional{isjunk\optional{,
64 a\optional{, b}}}}
65 Optional argument \var{isjunk} must be \code{None} (the default) or
66 a one-argument function that takes a sequence element and returns
67 true if and only if the element is ``junk'' and should be ignored.
68 \code{None} is equivalent to passing \code{lambda x: 0}, i.e.\ no
69 elements are ignored. For example, pass
70
Tim Peters754ba582001-02-20 11:24:35 +000071% BUG: the HTML generated for this is
72% BUG: lambda x: x in " \\t"
73% BUG: i.e. it displays two backslashes.
Fred Drakebaf71422001-02-19 16:31:02 +000074\begin{verbatim}
75lambda x: x in " \\t"
76\end{verbatim}
77
78 if you're comparing lines as sequences of characters, and don't want
79 to synch up on blanks or hard tabs.
80
81 The optional arguments \var{a} and \var{b} are sequences to be
82 compared; both default to empty strings. The elements of both
83 sequences must be hashable.
84\end{classdesc}
85
86
87\class{SequenceMatcher} objects have the following methods:
88
89\begin{methoddesc}{set_seqs}{a, b}
90 Set the two sequences to be compared.
91\end{methoddesc}
92
93\class{SequenceMatcher} computes and caches detailed information about
94the second sequence, so if you want to compare one sequence against
95many sequences, use \method{set_seq2()} to set the commonly used
96sequence once and call \method{set_seq1()} repeatedly, once for each
97of the other sequences.
98
99\begin{methoddesc}{set_seq1}{a}
100 Set the first sequence to be compared. The second sequence to be
101 compared is not changed.
102\end{methoddesc}
103
104\begin{methoddesc}{set_seq2}{b}
105 Set the second sequence to be compared. The first sequence to be
106 compared is not changed.
107\end{methoddesc}
108
109\begin{methoddesc}{find_longest_match}{alo, ahi, blo, bhi}
110 Find longest matching block in \code{\var{a}[\var{alo}:\var{ahi}]}
111 and \code{\var{b}[\var{blo}:\var{bhi}]}.
112
113 If \var{isjunk} was omitted or \code{None},
114 \method{get_longest_match()} returns \code{(\var{i}, \var{j},
115 \var{k})} such that \code{\var{a}[\var{i}:\var{i}+\var{k}]} is equal
116 to \code{\var{b}[\var{j}:\var{j}+\var{k}]}, where
117 \code{\var{alo} <= \var{i} <= \var{i}+\var{k} <= \var{ahi}} and
118 \code{\var{blo} <= \var{j} <= \var{j}+\var{k} <= \var{bhi}}.
119 For all \code{(\var{i'}, \var{j'}, \var{k'})} meeting those
120 conditions, the additional conditions
121 \code{\var{k} >= \var{k'}},
122 \code{\var{i} <= \var{i'}},
123 and if \code{\var{i} == \var{i'}}, \code{\var{j} <= \var{j'}}
124 are also met.
125 In other words, of all maximal matching blocks, return one that
126 starts earliest in \var{a}, and of all those maximal matching blocks
127 that start earliest in \var{a}, return the one that starts earliest
128 in \var{b}.
129
130\begin{verbatim}
131>>> s = SequenceMatcher(None, " abcd", "abcd abcd")
132>>> s.find_longest_match(0, 5, 0, 9)
133(0, 4, 5)
134\end{verbatim}
135
136 If \var{isjunk} was provided, first the longest matching block is
137 determined as above, but with the additional restriction that no
138 junk element appears in the block. Then that block is extended as
139 far as possible by matching (only) junk elements on both sides.
140 So the resulting block never matches on junk except as identical
141 junk happens to be adjacent to an interesting match.
142
143 Here's the same example as before, but considering blanks to be junk.
Tim Peters754ba582001-02-20 11:24:35 +0000144 That prevents \code{' abcd'} from matching the \code{' abcd'} at the
Fred Drakebaf71422001-02-19 16:31:02 +0000145 tail end of the second sequence directly. Instead only the
146 \code{'abcd'} can match, and matches the leftmost \code{'abcd'} in
147 the second sequence:
148
149\begin{verbatim}
150>>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
151>>> s.find_longest_match(0, 5, 0, 9)
152(1, 0, 4)
153\end{verbatim}
154
155 If no blocks match, this returns \code{(\var{alo}, \var{blo}, 0)}.
156\end{methoddesc}
157
158\begin{methoddesc}{get_matching_blocks}{}
159 Return list of triples describing matching subsequences.
160 Each triple is of the form \code{(\var{i}, \var{j}, \var{n})}, and
161 means that \code{\var{a}[\var{i}:\var{i}+\var{n}] ==
162 \var{b}[\var{j}:\var{j}+\var{n}]}. The triples are monotonically
163 increasing in \var{i} and \var{j}.
164
165 The last triple is a dummy, and has the value \code{(len(\var{a}),
166 len(\var{b}), 0)}. It is the only triple with \code{\var{n} == 0}.
167 % Explain why a dummy is used!
168
169\begin{verbatim}
170>>> s = SequenceMatcher(None, "abxcd", "abcd")
171>>> s.get_matching_blocks()
172[(0, 0, 2), (3, 2, 2), (5, 4, 0)]
173\end{verbatim}
174\end{methoddesc}
175
176\begin{methoddesc}{get_opcodes}{}
177 Return list of 5-tuples describing how to turn \var{a} into \var{b}.
178 Each tuple is of the form \code{(\var{tag}, \var{i1}, \var{i2},
179 \var{j1}, \var{j2})}. The first tuple has \code{\var{i1} ==
180 \var{j1} == 0}, and remaining tuples have \var{i1} equal to the
181 \var{i2} from the preceeding tuple, and, likewise, \var{j1} equal to
182 the previous \var{j2}.
183
184 The \var{tag} values are strings, with these meanings:
185
186\begin{tableii}{l|l}{code}{Value}{Meaning}
187 \lineii{'replace'}{\code{\var{a}[\var{i1}:\var{i2}]} should be
188 replaced by \code{\var{b}[\var{j1}:\var{j2}]}.}
189 \lineii{'delete'}{\code{\var{a}[\var{i1}:\var{i2}]} should be
190 deleted. Note that \code{\var{j1} == \var{j2}} in
191 this case.}
192 \lineii{'insert'}{\code{\var{b}[\var{j1}:\var{j2}]} should be
193 inserted at \code{\var{a}[\var{i1}:\var{i1}]}.
194 Note that \code{\var{i1} == \var{i2}} in this
195 case.}
196 \lineii{'equal'}{\code{\var{a}[\var{i1}:\var{i2}] ==
197 \var{b}[\var{j1}:\var{j2}]} (the sub-sequences are
198 equal).}
199\end{tableii}
200
201For example:
202
203\begin{verbatim}
204>>> a = "qabxcd"
205>>> b = "abycdf"
206>>> s = SequenceMatcher(None, a, b)
207>>> for tag, i1, i2, j1, j2 in s.get_opcodes():
208... print ("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
209... (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2]))
210 delete a[0:1] (q) b[0:0] ()
211 equal a[1:3] (ab) b[0:2] (ab)
212replace a[3:4] (x) b[2:3] (y)
213 equal a[4:6] (cd) b[3:5] (cd)
214 insert a[6:6] () b[5:6] (f)
215\end{verbatim}
216\end{methoddesc}
217
218\begin{methoddesc}{ratio}{}
219 Return a measure of the sequences' similarity as a float in the
220 range [0, 1].
221
222 Where T is the total number of elements in both sequences, and M is
Tim Peters754ba582001-02-20 11:24:35 +0000223 the number of matches, this is 2.0*M / T. Note that this is \code{1.}
224 if the sequences are identical, and \code{0.} if they have nothing in
Fred Drakebaf71422001-02-19 16:31:02 +0000225 common.
226
227 This is expensive to compute if \method{get_matching_blocks()} or
228 \method{get_opcodes()} hasn't already been called, in which case you
229 may want to try \method{quick_ratio()} or
230 \method{real_quick_ratio()} first to get an upper bound.
231\end{methoddesc}
232
233\begin{methoddesc}{quick_ratio}{}
234 Return an upper bound on \method{ratio()} relatively quickly.
235
236 This isn't defined beyond that it is an upper bound on
237 \method{ratio()}, and is faster to compute.
238\end{methoddesc}
239
240\begin{methoddesc}{real_quick_ratio}{}
241 Return an upper bound on \method{ratio()} very quickly.
242
243 This isn't defined beyond that it is an upper bound on
244 \method{ratio()}, and is faster to compute than either
245 \method{ratio()} or \method{quick_ratio()}.
246\end{methoddesc}
247
Tim Peters754ba582001-02-20 11:24:35 +0000248The three methods that return the ratio of matching to total characters
249can give different results due to differing levels of approximation,
250although \method{quick_ratio()} and \method{real_quick_ratio()} are always
251at least as large as \method{ratio()}:
Fred Drakebaf71422001-02-19 16:31:02 +0000252
253\begin{verbatim}
254>>> s = SequenceMatcher(None, "abcd", "bcde")
255>>> s.ratio()
2560.75
257>>> s.quick_ratio()
2580.75
259>>> s.real_quick_ratio()
2601.0
261\end{verbatim}
262
263
264\subsection{Examples \label{difflib-examples}}
265
266
267This example compares two strings, considering blanks to be ``junk:''
268
269\begin{verbatim}
270>>> s = SequenceMatcher(lambda x: x == " ",
271... "private Thread currentThread;",
272... "private volatile Thread currentThread;")
273\end{verbatim}
274
275\method{ratio()} returns a float in [0, 1], measuring the similarity
276of the sequences. As a rule of thumb, a \method{ratio()} value over
2770.6 means the sequences are close matches:
278
279\begin{verbatim}
280>>> print round(s.ratio(), 3)
2810.866
282\end{verbatim}
283
284If you're only interested in where the sequences match,
285\method{get_matching_blocks()} is handy:
286
287\begin{verbatim}
288>>> for block in s.get_matching_blocks():
289... print "a[%d] and b[%d] match for %d elements" % block
290a[0] and b[0] match for 8 elements
291a[8] and b[17] match for 6 elements
292a[14] and b[23] match for 15 elements
293a[29] and b[38] match for 0 elements
294\end{verbatim}
295
296Note that the last tuple returned by \method{get_matching_blocks()} is
297always a dummy, \code{(len(\var{a}), len(\var{b}), 0)}, and this is
298the only case in which the last tuple element (number of elements
299matched) is \code{0}.
300
301If you want to know how to change the first sequence into the second,
302use \method{get_opcodes()}:
303
304\begin{verbatim}
305>>> for opcode in s.get_opcodes():
306... print "%6s a[%d:%d] b[%d:%d]" % opcode
307 equal a[0:8] b[0:8]
308insert a[8:8] b[8:17]
309 equal a[8:14] b[17:23]
310 equal a[14:29] b[23:38]
311\end{verbatim}
312
313See \file{Tools/scripts/ndiff.py} from the Python source distribution
314for a fancy human-friendly file differencer, which uses
315\class{SequenceMatcher} both to view files as sequences of lines, and
316lines as sequences of characters.
317
318See also the function \function{get_close_matches()} in this module,
319which shows how simple code building on \class{SequenceMatcher} can be
320used to do useful work.