Blame - Doc/lib/libdifflib.tex - platform/external/python/cpython2

blob: e28b39fdcab6e6ba64afdb68d2ba9b69e2e61d2a [file] [log] [blame]

Fred Drake	baf7142	2001-02-19 16:31:02 +0000	[diff] [blame]	1	\section{\module{difflib} ---
				2	Helpers for computing deltas}
				3
				4	\declaremodule{standard}{difflib}
				5	\modulesynopsis{Helpers for computing differences between objects.}
				6	\moduleauthor{Tim Peters}{tim.one@home.com}
				7	\sectionauthor{Tim Peters}{tim.one@home.com}
				8	% LaTeXification by Fred L. Drake, Jr. <fdrake@acm.org>.
				9
Fred Drake	da00cda	2001-04-10 19:56:09 +0000	[diff] [blame]	10	\versionadded{2.1}
				11
				12
Fred Drake	6943a29	2001-08-13 19:31:59 +0000	[diff] [blame]	13	\begin{classdesc*}{SequenceMatcher}
				14	This is a flexible class for comparing pairs of sequences of any
				15	type, so long as the sequence elements are hashable. The basic
				16	algorithm predates, and is a little fancier than, an algorithm
				17	published in the late 1980's by Ratcliff and Obershelp under the
				18	hyperbolic name ``gestalt pattern matching.'' The idea is to find
				19	the longest contiguous matching subsequence that contains no
				20	``junk'' elements (the Ratcliff and Obershelp algorithm doesn't
				21	address junk). The same idea is then applied recursively to the
				22	pieces of the sequences to the left and to the right of the matching
				23	subsequence. This does not yield minimal edit sequences, but does
				24	tend to yield matches that ``look right'' to people.
				25
				26	\strong{Timing:} The basic Ratcliff-Obershelp algorithm is cubic
				27	time in the worst case and quadratic time in the expected case.
				28	\class{SequenceMatcher} is quadratic time for the worst case and has
				29	expected-case behavior dependent in a complicated way on how many
				30	elements the sequences have in common; best case time is linear.
				31	\end{classdesc*}
				32
				33	\begin{classdesc*}{Differ}
				34	This is a class for comparing sequences of lines of text, and
Tim Peters	8a9c284	2001-09-22 21:30:22 +0000	[diff] [blame]	35	producing human-readable differences or deltas. \class{Differ} uses
Fred Drake	6943a29	2001-08-13 19:31:59 +0000	[diff] [blame]	36	\class{SequenceMatcher} both to compare sequences of lines, and to
				37	compare sequences of characters within similar (near-matching)
				38	lines.
				39
				40	Each line of a \class{Differ} delta begins with a two-letter code:
				41
				42	\begin{tableii}{l\|l}{code}{Code}{Meaning}
				43	\lineii{'- '}{line unique to sequence 1}
				44	\lineii{'+ '}{line unique to sequence 2}
				45	\lineii{' '}{line common to both sequences}
				46	\lineii{'? '}{line not present in either input sequence}
				47	\end{tableii}
				48
				49	Lines beginning with `\code{?~}' attempt to guide the eye to
				50	intraline differences, and were not present in either input
				51	sequence. These lines can be confusing if the sequences contain tab
				52	characters.
				53	\end{classdesc*}
				54
Raymond Hettinger	e07b835	2003-06-09 21:44:59 +0000	[diff] [blame^]	55	\begin{funcdesc}{context_diff}{a, b\optional{, fromfile\optional{, tofile
				56	\optional{, fromfiledate\optional{, tofiledate\optional{, n
				57	\optional{, lineterm}}}}}}}
				58
				59	Compare \var{a} and \var{b} (lists of strings); return a
				60	delta (a generator generating the delta lines) in context diff
				61	format.
				62
				63	Context diffs are a compact way of showing just the lines that have
				64	changed plus a few lines of context. The changes are shown in a
				65	before/after style. The number of context lines is set by \var{n}
				66	which defaults to three.
				67
				68	By default, the diff control lines (those with \code{***} or \code{---})
				69	are created with a trailing newline. This is helpful so that inputs created
				70	from \function{file.readlines()} result in diffs that are suitable for use
				71	with \function{file.writelines()} since both the inputs and outputs have
				72	trailing newlines.
				73
				74	For inputs that do not have trailing newlines, set the \var{lineterm}
				75	argument to \code{""} so that the output will be uniformly newline free.
				76
				77	The context diff format normally has a header for filenames and
				78	modification times. Any or all of these may be specified using strings for
				79	\var{fromfile}, \var{tofile}, \var{fromfiledate}, and \var{tofiledate}.
				80	The modification times are normally expressed in the format returned by
				81	\function{time.ctime()}. If not specified, the strings default to blanks.
				82
				83	\file{Tools/scripts/diff.py} is a command-line front-end for this
				84	function.
				85	\end{funcdesc}
				86
Fred Drake	baf7142	2001-02-19 16:31:02 +0000	[diff] [blame]	87	\begin{funcdesc}{get_close_matches}{word, possibilities\optional{,
				88	n\optional{, cutoff}}}
				89	Return a list of the best ``good enough'' matches. \var{word} is a
				90	sequence for which close matches are desired (typically a string),
				91	and \var{possibilities} is a list of sequences against which to
				92	match \var{word} (typically a list of strings).
				93
				94	Optional argument \var{n} (default \code{3}) is the maximum number
				95	of close matches to return; \var{n} must be greater than \code{0}.
				96
				97	Optional argument \var{cutoff} (default \code{0.6}) is a float in
				98	the range [0, 1]. Possibilities that don't score at least that
				99	similar to \var{word} are ignored.
				100
				101	The best (no more than \var{n}) matches among the possibilities are
				102	returned in a list, sorted by similarity score, most similar first.
				103
				104	\begin{verbatim}
				105	>>> get_close_matches('appel', ['ape', 'apple', 'peach', 'puppy'])
				106	['apple', 'ape']
				107	>>> import keyword
				108	>>> get_close_matches('wheel', keyword.kwlist)
				109	['while']
				110	>>> get_close_matches('apple', keyword.kwlist)
				111	[]
				112	>>> get_close_matches('accept', keyword.kwlist)
				113	['except']
				114	\end{verbatim}
				115	\end{funcdesc}
				116
Fred Drake	6943a29	2001-08-13 19:31:59 +0000	[diff] [blame]	117	\begin{funcdesc}{ndiff}{a, b\optional{, linejunk\optional{,
				118	charjunk}}}
				119	Compare \var{a} and \var{b} (lists of strings); return a
Tim Peters	8a9c284	2001-09-22 21:30:22 +0000	[diff] [blame]	120	\class{Differ}-style delta (a generator generating the delta lines).
Fred Drake	baf7142	2001-02-19 16:31:02 +0000	[diff] [blame]	121
Fred Drake	6943a29	2001-08-13 19:31:59 +0000	[diff] [blame]	122	Optional keyword parameters \var{linejunk} and \var{charjunk} are
				123	for filter functions (or \code{None}):
				124
Tim Peters	81b9251	2002-04-29 01:37:32 +0000	[diff] [blame]	125	\var{linejunk}: A function that accepts a single string
				126	argument, and returns true if the string is junk, or false if not.
				127	The default is \code{None}, starting with Python 2.3. Before then,
				128	the default was the module-level function
Fred Drake	6943a29	2001-08-13 19:31:59 +0000	[diff] [blame]	129	\function{IS_LINE_JUNK()}, which filters out lines without visible
				130	characters, except for at most one pound character (\character{\#}).
Tim Peters	81b9251	2002-04-29 01:37:32 +0000	[diff] [blame]	131	As of Python 2.3, the underlying \class{SequenceMatcher} class
				132	does a dynamic analysis of which lines are so frequent as to
				133	constitute noise, and this usually works better than the pre-2.3
				134	default.
Fred Drake	6943a29	2001-08-13 19:31:59 +0000	[diff] [blame]	135
Tim Peters	81b9251	2002-04-29 01:37:32 +0000	[diff] [blame]	136	\var{charjunk}: A function that accepts a character (a string of
				137	length 1), and returns true if the character is junk, or false if not.
Fred Drake	6943a29	2001-08-13 19:31:59 +0000	[diff] [blame]	138	The default is module-level function \function{IS_CHARACTER_JUNK()},
				139	which filters out whitespace characters (a blank or tab; note: bad
				140	idea to include newline in this!).
				141
				142	\file{Tools/scripts/ndiff.py} is a command-line front-end to this
				143	function.
				144
				145	\begin{verbatim}
				146	>>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),
Fred Drake	edb635f	2002-12-06 18:52:28 +0000	[diff] [blame]	147	... 'ore\ntree\nemu\n'.splitlines(1))
Fred Drake	6943a29	2001-08-13 19:31:59 +0000	[diff] [blame]	148	>>> print ''.join(diff),
				149	- one
Tim Peters	8a9c284	2001-09-22 21:30:22 +0000	[diff] [blame]	150	? ^
Fred Drake	6943a29	2001-08-13 19:31:59 +0000	[diff] [blame]	151	+ ore
Tim Peters	8a9c284	2001-09-22 21:30:22 +0000	[diff] [blame]	152	? ^
Fred Drake	6943a29	2001-08-13 19:31:59 +0000	[diff] [blame]	153	- two
				154	- three
Tim Peters	8a9c284	2001-09-22 21:30:22 +0000	[diff] [blame]	155	? -
Fred Drake	6943a29	2001-08-13 19:31:59 +0000	[diff] [blame]	156	+ tree
				157	+ emu
				158	\end{verbatim}
				159	\end{funcdesc}
				160
				161	\begin{funcdesc}{restore}{sequence, which}
				162	Return one of the two sequences that generated a delta.
				163
				164	Given a \var{sequence} produced by \method{Differ.compare()} or
				165	\function{ndiff()}, extract lines originating from file 1 or 2
				166	(parameter \var{which}), stripping off line prefixes.
				167
				168	Example:
				169
				170	\begin{verbatim}
				171	>>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),
				172	... 'ore\ntree\nemu\n'.splitlines(1))
Tim Peters	8a9c284	2001-09-22 21:30:22 +0000	[diff] [blame]	173	>>> diff = list(diff) # materialize the generated delta into a list
Fred Drake	6943a29	2001-08-13 19:31:59 +0000	[diff] [blame]	174	>>> print ''.join(restore(diff, 1)),
				175	one
				176	two
				177	three
				178	>>> print ''.join(restore(diff, 2)),
				179	ore
				180	tree
				181	emu
				182	\end{verbatim}
				183
				184	\end{funcdesc}
				185
Raymond Hettinger	e07b835	2003-06-09 21:44:59 +0000	[diff] [blame^]	186	\begin{funcdesc}{unified_diff}{a, b\optional{, fromfile\optional{, tofile
				187	\optional{, fromfiledate\optional{, tofiledate\optional{, n
				188	\optional{, lineterm}}}}}}}
				189
				190	Compare \var{a} and \var{b} (lists of strings); return a
				191	delta (a generator generating the delta lines) in unified diff
				192	format.
				193
				194	Unified diffs are a compact way of showing just the lines that have
				195	changed plus a few lines of context. The changes are shown in an
				196	inline style (instead of separate before/after blocks). The number
				197	of context lines is set by \var{n} which defaults to three.
				198
				199	By default, the diff control lines (those with \code{---}, \code{+++},
				200	or \code{@@}) are created with a trailing newline. This is helpful so
				201	that inputs created from \function{file.readlines()} result in diffs
				202	that are suitable for use with \function{file.writelines()} since both
				203	the inputs and outputs have trailing newlines.
				204
				205	For inputs that do not have trailing newlines, set the \var{lineterm}
				206	argument to \code{""} so that the output will be uniformly newline free.
				207
				208	The unified diff format normally has a header for filenames and
				209	modification times. Any or all of these may be specified using strings for
				210	\var{fromfile}, \var{tofile}, \var{fromfiledate}, and \var{tofiledate}.
				211	The modification times are normally expressed in the format returned by
				212	\function{time.ctime()}. If not specified, the strings default to blanks.
				213
				214	\file{Tools/scripts/diff.py} is a command-line front-end for this
				215	function.
				216	\end{funcdesc}
Fred Drake	6943a29	2001-08-13 19:31:59 +0000	[diff] [blame]	217
Fred Drake	7f10cce	2001-10-26 03:04:23 +0000	[diff] [blame]	218	\begin{funcdesc}{IS_LINE_JUNK}{line}
				219	Return true for ignorable lines. The line \var{line} is ignorable
				220	if \var{line} is blank or contains a single \character{\#},
				221	otherwise it is not ignorable. Used as a default for parameter
Tim Peters	81b9251	2002-04-29 01:37:32 +0000	[diff] [blame]	222	\var{linejunk} in \function{ndiff()} before Python 2.3.
Fred Drake	6943a29	2001-08-13 19:31:59 +0000	[diff] [blame]	223	\end{funcdesc}
				224
				225
Fred Drake	7f10cce	2001-10-26 03:04:23 +0000	[diff] [blame]	226	\begin{funcdesc}{IS_CHARACTER_JUNK}{ch}
				227	Return true for ignorable characters. The character \var{ch} is
				228	ignorable if \var{ch} is a space or tab, otherwise it is not
				229	ignorable. Used as a default for parameter \var{charjunk} in
Fred Drake	6943a29	2001-08-13 19:31:59 +0000	[diff] [blame]	230	\function{ndiff()}.
Fred Drake	6943a29	2001-08-13 19:31:59 +0000	[diff] [blame]	231	\end{funcdesc}
Fred Drake	baf7142	2001-02-19 16:31:02 +0000	[diff] [blame]	232
				233
Fred Drake	6fda3ac	2001-04-10 18:41:16 +0000	[diff] [blame]	234	\begin{seealso}
				235	\seetitle{Pattern Matching: The Gestalt Approach}{Discussion of a
				236	similar algorithm by John W. Ratcliff and D. E. Metzener.
				237	This was published in
				238	\citetitle[http://www.ddj.com/]{Dr. Dobb's Journal} in
				239	July, 1988.}
				240	\end{seealso}
				241
				242
Fred Drake	baf7142	2001-02-19 16:31:02 +0000	[diff] [blame]	243	\subsection{SequenceMatcher Objects \label{sequence-matcher}}
				244
Fred Drake	96d7a70	2001-05-11 01:08:13 +0000	[diff] [blame]	245	The \class{SequenceMatcher} class has this constructor:
				246
Fred Drake	baf7142	2001-02-19 16:31:02 +0000	[diff] [blame]	247	\begin{classdesc}{SequenceMatcher}{\optional{isjunk\optional{,
				248	a\optional{, b}}}}
				249	Optional argument \var{isjunk} must be \code{None} (the default) or
				250	a one-argument function that takes a sequence element and returns
				251	true if and only if the element is ``junk'' and should be ignored.
Fred Drake	7f10cce	2001-10-26 03:04:23 +0000	[diff] [blame]	252	Passing \code{None} for \var{isjunk} is equivalent to passing
				253	\code{lambda x: 0}; in other words, no elements are ignored. For
				254	example, pass:
Fred Drake	baf7142	2001-02-19 16:31:02 +0000	[diff] [blame]	255
				256	\begin{verbatim}
Fred Drake	447f545	2001-02-23 19:13:07 +0000	[diff] [blame]	257	lambda x: x in " \t"
Fred Drake	baf7142	2001-02-19 16:31:02 +0000	[diff] [blame]	258	\end{verbatim}
				259
				260	if you're comparing lines as sequences of characters, and don't want
				261	to synch up on blanks or hard tabs.
				262
				263	The optional arguments \var{a} and \var{b} are sequences to be
				264	compared; both default to empty strings. The elements of both
				265	sequences must be hashable.
				266	\end{classdesc}
				267
				268
				269	\class{SequenceMatcher} objects have the following methods:
				270
				271	\begin{methoddesc}{set_seqs}{a, b}
				272	Set the two sequences to be compared.
				273	\end{methoddesc}
				274
				275	\class{SequenceMatcher} computes and caches detailed information about
				276	the second sequence, so if you want to compare one sequence against
				277	many sequences, use \method{set_seq2()} to set the commonly used
				278	sequence once and call \method{set_seq1()} repeatedly, once for each
				279	of the other sequences.
				280
				281	\begin{methoddesc}{set_seq1}{a}
				282	Set the first sequence to be compared. The second sequence to be
				283	compared is not changed.
				284	\end{methoddesc}
				285
				286	\begin{methoddesc}{set_seq2}{b}
				287	Set the second sequence to be compared. The first sequence to be
				288	compared is not changed.
				289	\end{methoddesc}
				290
				291	\begin{methoddesc}{find_longest_match}{alo, ahi, blo, bhi}
				292	Find longest matching block in \code{\var{a}[\var{alo}:\var{ahi}]}
				293	and \code{\var{b}[\var{blo}:\var{bhi}]}.
				294
				295	If \var{isjunk} was omitted or \code{None},
				296	\method{find_longest_match()} returns \code{(\var{i}, \var{j},
				297	\var{k})} such that \code{\var{a}[\var{i}:\var{i}+\var{k}]} is equal
Tim Peters	8a9c284	2001-09-22 21:30:22 +0000	[diff] [blame]	298	to \code{\var{b}[\var{j}:\var{j}+\var{k}]}, where
Fred Drake	baf7142	2001-02-19 16:31:02 +0000	[diff] [blame]	299	\code{\var{alo} <= \var{i} <= \var{i}+\var{k} <= \var{ahi}} and
				300	\code{\var{blo} <= \var{j} <= \var{j}+\var{k} <= \var{bhi}}.
				301	For all \code{(\var{i'}, \var{j'}, \var{k'})} meeting those
				302	conditions, the additional conditions
				303	\code{\var{k} >= \var{k'}},
				304	\code{\var{i} <= \var{i'}},
				305	and if \code{\var{i} == \var{i'}}, \code{\var{j} <= \var{j'}}
				306	are also met.
				307	In other words, of all maximal matching blocks, return one that
				308	starts earliest in \var{a}, and of all those maximal matching blocks
				309	that start earliest in \var{a}, return the one that starts earliest
				310	in \var{b}.
				311
				312	\begin{verbatim}
				313	>>> s = SequenceMatcher(None, " abcd", "abcd abcd")
				314	>>> s.find_longest_match(0, 5, 0, 9)
				315	(0, 4, 5)
				316	\end{verbatim}
				317
				318	If \var{isjunk} was provided, first the longest matching block is
				319	determined as above, but with the additional restriction that no
				320	junk element appears in the block. Then that block is extended as
				321	far as possible by matching (only) junk elements on both sides.
				322	So the resulting block never matches on junk except as identical
				323	junk happens to be adjacent to an interesting match.
				324
				325	Here's the same example as before, but considering blanks to be junk.
Tim Peters	754ba58	2001-02-20 11:24:35 +0000	[diff] [blame]	326	That prevents \code{' abcd'} from matching the \code{' abcd'} at the
Fred Drake	baf7142	2001-02-19 16:31:02 +0000	[diff] [blame]	327	tail end of the second sequence directly. Instead only the
				328	\code{'abcd'} can match, and matches the leftmost \code{'abcd'} in
				329	the second sequence:
				330
				331	\begin{verbatim}
				332	>>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
				333	>>> s.find_longest_match(0, 5, 0, 9)
				334	(1, 0, 4)
				335	\end{verbatim}
				336
				337	If no blocks match, this returns \code{(\var{alo}, \var{blo}, 0)}.
				338	\end{methoddesc}
				339
				340	\begin{methoddesc}{get_matching_blocks}{}
				341	Return list of triples describing matching subsequences.
				342	Each triple is of the form \code{(\var{i}, \var{j}, \var{n})}, and
				343	means that \code{\var{a}[\var{i}:\var{i}+\var{n}] ==
				344	\var{b}[\var{j}:\var{j}+\var{n}]}. The triples are monotonically
				345	increasing in \var{i} and \var{j}.
				346
				347	The last triple is a dummy, and has the value \code{(len(\var{a}),
				348	len(\var{b}), 0)}. It is the only triple with \code{\var{n} == 0}.
				349	% Explain why a dummy is used!
				350
				351	\begin{verbatim}
				352	>>> s = SequenceMatcher(None, "abxcd", "abcd")
				353	>>> s.get_matching_blocks()
				354	[(0, 0, 2), (3, 2, 2), (5, 4, 0)]
				355	\end{verbatim}
				356	\end{methoddesc}
				357
				358	\begin{methoddesc}{get_opcodes}{}
				359	Return list of 5-tuples describing how to turn \var{a} into \var{b}.
				360	Each tuple is of the form \code{(\var{tag}, \var{i1}, \var{i2},
				361	\var{j1}, \var{j2})}. The first tuple has \code{\var{i1} ==
				362	\var{j1} == 0}, and remaining tuples have \var{i1} equal to the
				363	\var{i2} from the preceding tuple, and, likewise, \var{j1} equal to
				364	the previous \var{j2}.
				365
				366	The \var{tag} values are strings, with these meanings:
				367
				368	\begin{tableii}{l\|l}{code}{Value}{Meaning}
				369	\lineii{'replace'}{\code{\var{a}[\var{i1}:\var{i2}]} should be
				370	replaced by \code{\var{b}[\var{j1}:\var{j2}]}.}
				371	\lineii{'delete'}{\code{\var{a}[\var{i1}:\var{i2}]} should be
				372	deleted. Note that \code{\var{j1} == \var{j2}} in
				373	this case.}
				374	\lineii{'insert'}{\code{\var{b}[\var{j1}:\var{j2}]} should be
Tim Peters	8a9c284	2001-09-22 21:30:22 +0000	[diff] [blame]	375	inserted at \code{\var{a}[\var{i1}:\var{i1}]}.
Fred Drake	baf7142	2001-02-19 16:31:02 +0000	[diff] [blame]	376	Note that \code{\var{i1} == \var{i2}} in this
				377	case.}
				378	\lineii{'equal'}{\code{\var{a}[\var{i1}:\var{i2}] ==
				379	\var{b}[\var{j1}:\var{j2}]} (the sub-sequences are
				380	equal).}
				381	\end{tableii}
				382
				383	For example:
				384
				385	\begin{verbatim}
				386	>>> a = "qabxcd"
				387	>>> b = "abycdf"
				388	>>> s = SequenceMatcher(None, a, b)
				389	>>> for tag, i1, i2, j1, j2 in s.get_opcodes():
				390	... print ("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
				391	... (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2]))
				392	delete a[0:1] (q) b[0:0] ()
				393	equal a[1:3] (ab) b[0:2] (ab)
				394	replace a[3:4] (x) b[2:3] (y)
				395	equal a[4:6] (cd) b[3:5] (cd)
				396	insert a[6:6] () b[5:6] (f)
				397	\end{verbatim}
				398	\end{methoddesc}
				399
				400	\begin{methoddesc}{ratio}{}
				401	Return a measure of the sequences' similarity as a float in the
				402	range [0, 1].
				403
				404	Where T is the total number of elements in both sequences, and M is
Fred Drake	6943a29	2001-08-13 19:31:59 +0000	[diff] [blame]	405	the number of matches, this is 2.0*M / T. Note that this is
				406	\code{1.0} if the sequences are identical, and \code{0.0} if they
				407	have nothing in common.
Fred Drake	baf7142	2001-02-19 16:31:02 +0000	[diff] [blame]	408
				409	This is expensive to compute if \method{get_matching_blocks()} or
				410	\method{get_opcodes()} hasn't already been called, in which case you
				411	may want to try \method{quick_ratio()} or
				412	\method{real_quick_ratio()} first to get an upper bound.
				413	\end{methoddesc}
				414
				415	\begin{methoddesc}{quick_ratio}{}
				416	Return an upper bound on \method{ratio()} relatively quickly.
				417
				418	This isn't defined beyond that it is an upper bound on
				419	\method{ratio()}, and is faster to compute.
				420	\end{methoddesc}
				421
				422	\begin{methoddesc}{real_quick_ratio}{}
				423	Return an upper bound on \method{ratio()} very quickly.
				424
				425	This isn't defined beyond that it is an upper bound on
				426	\method{ratio()}, and is faster to compute than either
				427	\method{ratio()} or \method{quick_ratio()}.
				428	\end{methoddesc}
				429
Tim Peters	754ba58	2001-02-20 11:24:35 +0000	[diff] [blame]	430	The three methods that return the ratio of matching to total characters
				431	can give different results due to differing levels of approximation,
				432	although \method{quick_ratio()} and \method{real_quick_ratio()} are always
				433	at least as large as \method{ratio()}:
Fred Drake	baf7142	2001-02-19 16:31:02 +0000	[diff] [blame]	434
				435	\begin{verbatim}
				436	>>> s = SequenceMatcher(None, "abcd", "bcde")
				437	>>> s.ratio()
				438	0.75
				439	>>> s.quick_ratio()
				440	0.75
				441	>>> s.real_quick_ratio()
				442	1.0
				443	\end{verbatim}
				444
				445
Fred Drake	6943a29	2001-08-13 19:31:59 +0000	[diff] [blame]	446	\subsection{SequenceMatcher Examples \label{sequencematcher-examples}}
Fred Drake	baf7142	2001-02-19 16:31:02 +0000	[diff] [blame]	447
				448
				449	This example compares two strings, considering blanks to be ``junk:''
				450
				451	\begin{verbatim}
				452	>>> s = SequenceMatcher(lambda x: x == " ",
				453	... "private Thread currentThread;",
				454	... "private volatile Thread currentThread;")
				455	\end{verbatim}
				456
				457	\method{ratio()} returns a float in [0, 1], measuring the similarity
				458	of the sequences. As a rule of thumb, a \method{ratio()} value over
				459	0.6 means the sequences are close matches:
				460
				461	\begin{verbatim}
				462	>>> print round(s.ratio(), 3)
				463	0.866
				464	\end{verbatim}
				465
				466	If you're only interested in where the sequences match,
				467	\method{get_matching_blocks()} is handy:
				468
				469	\begin{verbatim}
				470	>>> for block in s.get_matching_blocks():
				471	... print "a[%d] and b[%d] match for %d elements" % block
				472	a[0] and b[0] match for 8 elements
				473	a[8] and b[17] match for 6 elements
				474	a[14] and b[23] match for 15 elements
				475	a[29] and b[38] match for 0 elements
				476	\end{verbatim}
				477
				478	Note that the last tuple returned by \method{get_matching_blocks()} is
				479	always a dummy, \code{(len(\var{a}), len(\var{b}), 0)}, and this is
				480	the only case in which the last tuple element (number of elements
				481	matched) is \code{0}.
				482
				483	If you want to know how to change the first sequence into the second,
				484	use \method{get_opcodes()}:
				485
				486	\begin{verbatim}
				487	>>> for opcode in s.get_opcodes():
				488	... print "%6s a[%d:%d] b[%d:%d]" % opcode
				489	equal a[0:8] b[0:8]
				490	insert a[8:8] b[8:17]
				491	equal a[8:14] b[17:23]
				492	equal a[14:29] b[23:38]
				493	\end{verbatim}
				494
Fred Drake	baf7142	2001-02-19 16:31:02 +0000	[diff] [blame]	495	See also the function \function{get_close_matches()} in this module,
				496	which shows how simple code building on \class{SequenceMatcher} can be
				497	used to do useful work.
Fred Drake	6943a29	2001-08-13 19:31:59 +0000	[diff] [blame]	498
				499
				500	\subsection{Differ Objects \label{differ-objects}}
				501
				502	Note that \class{Differ}-generated deltas make no claim to be
				503	\strong{minimal} diffs. To the contrary, minimal diffs are often
				504	counter-intuitive, because they synch up anywhere possible, sometimes
				505	accidental matches 100 pages apart. Restricting synch points to
				506	contiguous matches preserves some notion of locality, at the
				507	occasional cost of producing a longer diff.
				508
				509	The \class{Differ} class has this constructor:
				510
				511	\begin{classdesc}{Differ}{\optional{linejunk\optional{, charjunk}}}
				512	Optional keyword parameters \var{linejunk} and \var{charjunk} are
				513	for filter functions (or \code{None}):
				514
Tim Peters	81b9251	2002-04-29 01:37:32 +0000	[diff] [blame]	515	\var{linejunk}: A function that accepts a single string
				516	argument, and returns true if the string is junk. The default is
				517	\code{None}, meaning that no line is considered junk.
Fred Drake	6943a29	2001-08-13 19:31:59 +0000	[diff] [blame]	518
Tim Peters	81b9251	2002-04-29 01:37:32 +0000	[diff] [blame]	519	\var{charjunk}: A function that accepts a single character argument
				520	(a string of length 1), and returns true if the character is junk.
				521	The default is \code{None}, meaning that no character is
				522	considered junk.
Fred Drake	6943a29	2001-08-13 19:31:59 +0000	[diff] [blame]	523	\end{classdesc}
				524
				525	\class{Differ} objects are used (deltas generated) via a single
				526	method:
				527
				528	\begin{methoddesc}{compare}{a, b}
Tim Peters	8a9c284	2001-09-22 21:30:22 +0000	[diff] [blame]	529	Compare two sequences of lines, and generate the delta (a sequence
				530	of lines).
Fred Drake	6943a29	2001-08-13 19:31:59 +0000	[diff] [blame]	531
				532	Each sequence must contain individual single-line strings ending
				533	with newlines. Such sequences can be obtained from the
Tim Peters	8a9c284	2001-09-22 21:30:22 +0000	[diff] [blame]	534	\method{readlines()} method of file-like objects. The delta generated
				535	also consists of newline-terminated strings, ready to be printed as-is
Fred Drake	389aa17	2001-11-29 19:04:50 +0000	[diff] [blame]	536	via the \method{writelines()} method of a file-like object.
Fred Drake	6943a29	2001-08-13 19:31:59 +0000	[diff] [blame]	537	\end{methoddesc}
				538
				539
				540	\subsection{Differ Example \label{differ-examples}}
				541
				542	This example compares two texts. First we set up the texts, sequences
				543	of individual single-line strings ending with newlines (such sequences
				544	can also be obtained from the \method{readlines()} method of file-like
				545	objects):
				546
				547	\begin{verbatim}
				548	>>> text1 = ''' 1. Beautiful is better than ugly.
				549	... 2. Explicit is better than implicit.
				550	... 3. Simple is better than complex.
				551	... 4. Complex is better than complicated.
				552	... '''.splitlines(1)
				553	>>> len(text1)
				554	4
				555	>>> text1[0][-1]
				556	'\n'
				557	>>> text2 = ''' 1. Beautiful is better than ugly.
				558	... 3. Simple is better than complex.
				559	... 4. Complicated is better than complex.
				560	... 5. Flat is better than nested.
				561	... '''.splitlines(1)
				562	\end{verbatim}
				563
				564	Next we instantiate a Differ object:
				565
				566	\begin{verbatim}
				567	>>> d = Differ()
				568	\end{verbatim}
				569
				570	Note that when instantiating a \class{Differ} object we may pass
				571	functions to filter out line and character ``junk.'' See the
				572	\method{Differ()} constructor for details.
				573
				574	Finally, we compare the two:
				575
				576	\begin{verbatim}
Tim Peters	8a9c284	2001-09-22 21:30:22 +0000	[diff] [blame]	577	>>> result = list(d.compare(text1, text2))
Fred Drake	6943a29	2001-08-13 19:31:59 +0000	[diff] [blame]	578	\end{verbatim}
				579
				580	\code{result} is a list of strings, so let's pretty-print it:
				581
				582	\begin{verbatim}
				583	>>> from pprint import pprint
				584	>>> pprint(result)
				585	[' 1. Beautiful is better than ugly.\n',
				586	'- 2. Explicit is better than implicit.\n',
				587	'- 3. Simple is better than complex.\n',
				588	'+ 3. Simple is better than complex.\n',
				589	'? ++ \n',
				590	'- 4. Complex is better than complicated.\n',
				591	'? ^ ---- ^ \n',
				592	'+ 4. Complicated is better than complex.\n',
				593	'? ++++ ^ ^ \n',
				594	'+ 5. Flat is better than nested.\n']
				595	\end{verbatim}
				596
				597	As a single multi-line string it looks like this:
				598
				599	\begin{verbatim}
				600	>>> import sys
				601	>>> sys.stdout.writelines(result)
				602	1. Beautiful is better than ugly.
				603	- 2. Explicit is better than implicit.
				604	- 3. Simple is better than complex.
				605	+ 3. Simple is better than complex.
				606	? ++
				607	- 4. Complex is better than complicated.
				608	? ^ ---- ^
				609	+ 4. Complicated is better than complex.
				610	? ++++ ^ ^
				611	+ 5. Flat is better than nested.
				612	\end{verbatim}