#! /usr/bin/env python

# Module ndiff version 1.3.0
# Released to the public domain 26-Mar-1999,
# by Tim Peters (tim_one@email.msn.com).

# Provided as-is; use at your own risk; no warranty; no promises; enjoy!

"""ndiff [-q] file1 file2

Print a human-friendly file difference report to stdout.  Both inter-
and intra-line differences are noted.

If -q ("quiet") is not specified, the first two lines of output are

-: file1
+: file2

Each remaining line begins with a two-letter code:

    "- "    line unique to file1
    "+ "    line unique to file2
    "  "    line common to both files
    "? "    line not present in either input file

Lines beginning with "? " attempt to guide the eye to intraline
differences, and were not present in either input file.

The first file can be recovered by retaining only lines that begin with
"  " or "- ", and deleting those 2-character prefixes.

The second file can be recovered similarly, but by retaining only "  "
and "+ " lines.  On Unix, the second file can be recovered by piping the
output through
    sed -n '/^[+ ] /s/^..//p'
Modifications to recover the first file are left as an exercise for
the reader.

See module comments for details and programmatic interface.
"""

__version__ = 1, 3, 0

# SequenceMatcher tries to compute a "human-friendly diff" between
# two sequences (chiefly picturing a file as a sequence of lines,
# and a line as a sequence of characters, here).  Unlike e.g. UNIX(tm)
# diff, the fundamental notion is the longest *contiguous* & junk-free
# matching subsequence.  That's what catches people's eyes.  The
# Windows(tm) windiff has another interesting notion, pairing up elements
# that appear uniquely in each sequence.  That, and the method here,
# appear to yield more intuitive difference reports than does diff.  This
# method appears to be the least vulnerable to synching up on blocks
# of "junk lines", though (like blank lines in ordinary text files,
# or maybe "<P>" lines in HTML files).  That may be because this is
# the only method of the 3 that has a *concept* of "junk" <wink>.
#
# Note that ndiff makes no claim to produce a *minimal* diff.  To the
# contrary, minimal diffs are often counter-intuitive, because they
# synch up anywhere possible, sometimes on accidental matches 100
# pages apart.  Restricting synch points to contiguous matches
# preserves some notion of locality, at the occasional cost of
# producing a longer diff.
#
# With respect to junk, an earlier version of ndiff simply refused to
# *start* a match with a junk element.  The result was cases like this:
#     before: private Thread currentThread;
#     after:  private volatile Thread currentThread;
# If you consider whitespace to be junk, the longest contiguous match
# not starting with junk is "e Thread currentThread".  So ndiff reported
# that "e volatil" was inserted between the 't' and the 'e' in "private".
# While an accurate view, to people that's absurd.  The current version
# looks for matching blocks that are entirely junk-free, then extends the
# longest one of those as far as possible but only with matching junk.
# So now "currentThread" is matched, then extended to suck up the
# preceding blank; then "private" is matched, and extended to suck up the
# following blank; then "Thread" is matched; and finally ndiff reports
# that "volatile " was inserted before "Thread".  The only quibble
# remaining is that perhaps it was really the case that " volatile"
# was inserted after "private".  I can live with that <wink>.
#
# NOTE on junk:  the module-level names
#    IS_LINE_JUNK
#    IS_CHARACTER_JUNK
# can be set to any functions you like.  The first one should accept
# a single string argument, and return true iff the string is junk.
# The default is whether the regexp r"\s*#?\s*$" matches (i.e., a
# line without visible characters, except for at most one splat).
# The second should accept a single-character string, and return true
# iff that character is junk.  The default is whether the character is
# a blank or tab (note:  it's a bad idea to include newline in this!).
#
# After setting those, you can call fcompare(f1name, f2name) with the
# names of the files you want to compare.  The difference report
# is sent to stdout.  Or you can call main(args), passing what would
# have been in sys.argv[1:] had the cmd-line form been used.
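#
# For instance (a minimal sketch; the file names and the replacement
# junk predicate are made up for illustration):
#
#     import string, ndiff
#     def blank_only(line):
#         # treat only truly blank lines as junk, not "#" lines
#         return string.strip(line) == ""
#     ndiff.IS_LINE_JUNK = blank_only
#     ndiff.fcompare("old_spam.py", "new_spam.py")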

import string
TRACE = 0

# define what "junk" means
import re

def IS_LINE_JUNK(line, pat=re.compile(r"\s*#?\s*$").match):
    return pat(line) is not None

def IS_CHARACTER_JUNK(ch, ws=" \t"):
    return ch in ws
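
# For example, with the defaults above:
#     IS_LINE_JUNK("\n")        returns true  (no visible characters)
#     IS_LINE_JUNK("  #  \n")   returns true  (at most one splat)
#     IS_LINE_JUNK("x = 1\n")   returns false
#     IS_CHARACTER_JUNK(" ")    returns true
#     IS_CHARACTER_JUNK("x")    returns false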

del re

class SequenceMatcher:
    def __init__(self, isjunk=None, a='', b=''):
        # Members:
        # a
        #      first sequence
        # b
        #      second sequence; differences are computed as "what do
        #      we need to do to 'a' to change it into 'b'?"
        # b2j
        #      for x in b, b2j[x] is a list of the indices (into b)
        #      at which x appears; junk elements do not appear
        # b2jhas
        #      b2j.has_key
        # fullbcount
        #      for x in b, fullbcount[x] == the number of times x
        #      appears in b; only materialized if really needed (used
        #      only for computing quick_ratio())
        # matching_blocks
        #      a list of (i, j, k) triples, where a[i:i+k] == b[j:j+k];
        #      ascending & non-overlapping in i and in j; terminated by
        #      a dummy (len(a), len(b), 0) sentinel
        # opcodes
        #      a list of (tag, i1, i2, j1, j2) tuples, where tag is
        #      one of
        #          'replace'  a[i1:i2] should be replaced by b[j1:j2]
        #          'delete'   a[i1:i2] should be deleted
        #          'insert'   b[j1:j2] should be inserted
        #          'equal'    a[i1:i2] == b[j1:j2]
        # isjunk
        #      a user-supplied function taking a sequence element and
        #      returning true iff the element is "junk" -- this has
        #      subtle but helpful effects on the algorithm, which I'll
        #      get around to writing up someday <0.9 wink>.
        #      DON'T USE!  Only __chain_b uses this.  Use isbjunk.
        # isbjunk
        #      for x in b, isbjunk(x) == isjunk(x) but much faster;
        #      it's really the has_key method of a hidden dict.
        #      DOES NOT WORK for x in a!

        self.isjunk = isjunk
        self.a = self.b = None
        self.set_seqs(a, b)

    def set_seqs(self, a, b):
        self.set_seq1(a)
        self.set_seq2(b)

    def set_seq1(self, a):
        if a is self.a:
            return
        self.a = a
        self.matching_blocks = self.opcodes = None

    def set_seq2(self, b):
        if b is self.b:
            return
        self.b = b
        self.matching_blocks = self.opcodes = None
        self.fullbcount = None
        self.__chain_b()

    # For each element x in b, set b2j[x] to a list of the indices in
    # b where x appears; the indices are in increasing order; note that
    # the number of times x appears in b is len(b2j[x]) ...
    # when self.isjunk is defined, junk elements don't show up in this
    # map at all, which stops the central find_longest_match method
    # from starting any matching block at a junk element ...
    # also creates the fast isbjunk function ...
    # note that this is only called when b changes; so for cross-product
    # kinds of matches, it's best to call set_seq2 once, then set_seq1
    # repeatedly

    def __chain_b(self):
        # Because isjunk is a user-defined (not C) function, and we test
        # for junk a LOT, it's important to minimize the number of calls.
        # Before the tricks described here, __chain_b was by far the most
        # time-consuming routine in the whole module!  If anyone sees
        # Jim Roskind, thank him again for profile.py -- I never would
        # have guessed that.
        # The first trick is to build b2j ignoring the possibility
        # of junk.  I.e., we don't call isjunk at all yet.  Throwing
        # out the junk later is much cheaper than building b2j "right"
        # from the start.
        b = self.b
        self.b2j = b2j = {}
        self.b2jhas = b2jhas = b2j.has_key
        for i in xrange(len(b)):
            elt = b[i]
            if b2jhas(elt):
                b2j[elt].append(i)
            else:
                b2j[elt] = [i]

        # Now b2j.keys() contains elements uniquely, and especially when
        # the sequence is a string, that's usually a good deal smaller
        # than len(string).  The difference is the number of isjunk calls
        # saved.
        isjunk, junkdict = self.isjunk, {}
        if isjunk:
            for elt in b2j.keys():
                if isjunk(elt):
                    junkdict[elt] = 1   # value irrelevant; it's a set
                    del b2j[elt]

        # Now for x in b, isjunk(x) == junkdict.has_key(x), but the
        # latter is much faster.  Note too that while there may be a
        # lot of junk in the sequence, the number of *unique* junk
        # elements is probably small.  So the memory burden of keeping
        # this dict alive is likely trivial compared to the size of b2j.
        self.isbjunk = junkdict.has_key
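
        # For example, for b = "abcab" with no junk function this
        # leaves b2j == {'a': [0, 3], 'b': [1, 4], 'c': [2]}, and
        # isbjunk is always false.  With IS_CHARACTER_JUNK and
        # b = "a b", the blank becomes a junkdict key and is removed
        # from b2j entirely.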

    def find_longest_match(self, alo, ahi, blo, bhi):
        """Find longest matching block in a[alo:ahi] and b[blo:bhi].

        If isjunk is not defined:

        Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
            alo <= i <= i+k <= ahi
            blo <= j <= j+k <= bhi
        and for all (i',j',k') meeting those conditions,
            k >= k'
            i <= i'
            and if i == i', j <= j'
        In other words, of all maximal matching blocks, return one
        that starts earliest in a, and of all those maximal matching
        blocks that start earliest in a, return the one that starts
        earliest in b.

        If isjunk is defined, first the longest matching block is
        determined as above, but with the additional restriction that
        no junk element appears in the block.  Then that block is
        extended as far as possible by matching (only) junk elements on
        both sides.  So the resulting block never matches on junk except
        as identical junk happens to be adjacent to an "interesting"
        match.

        If no blocks match, return (alo, blo, 0).
        """

        # CAUTION:  stripping common prefix or suffix would be incorrect.
        # E.g.,
        #    ab
        #    acab
        # Longest matching block is "ab", but if common prefix is
        # stripped, it's "a" (tied with "b").  UNIX(tm) diff does so
        # strip, so ends up claiming that ab is changed to acab by
        # inserting "ca" in the middle.  That's minimal but unintuitive:
        # "it's obvious" that someone inserted "ac" at the front.
        # Windiff ends up at the same place as diff, but by pairing up
        # the unique 'b's and then matching the first two 'a's.

        a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
        besti, bestj, bestsize = alo, blo, 0
        # find longest junk-free match
        # during an iteration of the loop, j2len[j] = length of longest
        # junk-free match ending with a[i-1] and b[j]
        j2len = {}
        nothing = []
        for i in xrange(alo, ahi):
            # look at all instances of a[i] in b; note that because
            # b2j has no junk keys, the loop is skipped if a[i] is junk
            j2lenget = j2len.get
            newj2len = {}
            for j in b2j.get(a[i], nothing):
                # a[i] matches b[j]
                if j < blo:
                    continue
                if j >= bhi:
                    break
                k = newj2len[j] = j2lenget(j-1, 0) + 1
                if k > bestsize:
                    besti, bestj, bestsize = i-k+1, j-k+1, k
            j2len = newj2len

        # Now that we have a wholly interesting match (albeit possibly
        # empty!), we may as well suck up the matching junk on each
        # side of it too.  Can't think of a good reason not to, and it
        # saves post-processing the (possibly considerable) expense of
        # figuring out what to do with it.  In the case of an empty
        # interesting match, this is clearly the right thing to do,
        # because no other kind of match is possible in the regions.
        while besti > alo and bestj > blo and \
              isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize = bestsize + 1

        if TRACE:
            print "find_longest_match", alo, ahi, blo, bhi
            print "    returns", besti, bestj, bestsize
        return besti, bestj, bestsize
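
    # Example:  with no junk function, for a = " abcd" and
    # b = "abcd abcd", find_longest_match(0, 5, 0, 9) returns
    # (0, 4, 5), because the leading blank may participate:
    # a[0:5] == b[4:9] == " abcd".  If blanks are junk, the longest
    # junk-free match is a[1:5] == b[0:4] == "abcd", no junk
    # extension applies, and it returns (1, 0, 4) instead.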

    def get_matching_blocks(self):
        if self.matching_blocks is not None:
            return self.matching_blocks
        self.matching_blocks = []
        la, lb = len(self.a), len(self.b)
        self.__helper(0, la, 0, lb, self.matching_blocks)
        self.matching_blocks.append( (la, lb, 0) )
        if TRACE:
            print '*** matching blocks', self.matching_blocks
        return self.matching_blocks
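
    # Example:  for a = "qabxcd" and b = "abycdf" (no junk), this
    # returns [(1, 0, 2), (4, 3, 2), (6, 6, 0)]:  "ab" and "cd"
    # match, and the last triple is the zero-length sentinel.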

    # builds list of matching blocks covering a[alo:ahi] and
    # b[blo:bhi], appending them in increasing order to answer

    def __helper(self, alo, ahi, blo, bhi, answer):
        i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi)
        # a[alo:i] vs b[blo:j] unknown
        # a[i:i+k] same as b[j:j+k]
        # a[i+k:ahi] vs b[j+k:bhi] unknown
        if k:
            if alo < i and blo < j:
                self.__helper(alo, i, blo, j, answer)
            answer.append( x )
            if i+k < ahi and j+k < bhi:
                self.__helper(i+k, ahi, j+k, bhi, answer)

    def ratio(self):
        """Return a measure of the sequences' similarity (float in [0,1]).

        Where T is the total number of elements in both sequences, and
        M is the number of matches, this is 2*M / T.
        Note that this is 1 if the sequences are identical, and 0 if
        they have nothing in common.
        """

        matches = reduce(lambda sum, triple: sum + triple[-1],
                         self.get_matching_blocks(), 0)
        return 2.0 * matches / (len(self.a) + len(self.b))
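
    # Example:  for a = "abcd" and b = "bcde" the single matching
    # block is "bcd", so M = 3, T = 8, and ratio() is
    # 2.0 * 3 / 8 == 0.75.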

    def quick_ratio(self):
        """Return an upper bound on ratio() relatively quickly."""
        # viewing a and b as multisets, set matches to the cardinality
        # of their intersection; this counts the number of matches
        # without regard to order, so is clearly an upper bound
        if self.fullbcount is None:
            self.fullbcount = fullbcount = {}
            for elt in self.b:
                fullbcount[elt] = fullbcount.get(elt, 0) + 1
        fullbcount = self.fullbcount
        # avail[x] is the number of times x appears in 'b' less the
        # number of times we've seen it in 'a' so far ... kinda
        avail = {}
        availhas, matches = avail.has_key, 0
        for elt in self.a:
            if availhas(elt):
                numb = avail[elt]
            else:
                numb = fullbcount.get(elt, 0)
            avail[elt] = numb - 1
            if numb > 0:
                matches = matches + 1
        return 2.0 * matches / (len(self.a) + len(self.b))

    def real_quick_ratio(self):
        """Return an upper bound on ratio() very quickly."""
        la, lb = len(self.a), len(self.b)
        # can't have more matches than the number of elements in the
        # shorter sequence
        return 2.0 * min(la, lb) / (la + lb)
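
    # Example:  for a = "abcd" and b = "bcde", quick_ratio() counts
    # the multiset intersection {'b', 'c', 'd'} (3 matches), giving
    # 0.75, and real_quick_ratio() gives 2.0 * min(4, 4) / 8 == 1.0;
    # both bound ratio() == 0.75 from above.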

    def get_opcodes(self):
        if self.opcodes is not None:
            return self.opcodes
        i = j = 0
        self.opcodes = answer = []
        for ai, bj, size in self.get_matching_blocks():
            # invariant:  we've pumped out correct diffs to change
            # a[:i] into b[:j], and the next matching block is
            # a[ai:ai+size] == b[bj:bj+size].  So we need to pump
            # out a diff to change a[i:ai] into b[j:bj], pump out
            # the matching block, and move (i,j) beyond the match
            tag = ''
            if i < ai and j < bj:
                tag = 'replace'
            elif i < ai:
                tag = 'delete'
            elif j < bj:
                tag = 'insert'
            if tag:
                answer.append( (tag, i, ai, j, bj) )
            i, j = ai+size, bj+size
            # the list of matching blocks is terminated by a
            # sentinel with size 0
            if size:
                answer.append( ('equal', ai, i, bj, j) )
        return answer
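
    # Example:  for a = "qabxcd" and b = "abycdf" this returns
    #     [('delete',  0, 1, 0, 0),    # kill the leading "q"
    #      ('equal',   1, 3, 0, 2),    # "ab" matches
    #      ('replace', 3, 4, 2, 3),    # "x" -> "y"
    #      ('equal',   4, 6, 3, 5),    # "cd" matches
    #      ('insert',  6, 6, 5, 6)]    # append the trailing "f"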

# meant for dumping lines
def dump(tag, x, lo, hi):
    for i in xrange(lo, hi):
        print tag, x[i],

# figure out which mark to stick under characters in lines that
# have changed (blank = same, - = deleted, + = inserted, ^ = replaced)
_combine = { '  ': ' ',
             '. ': '-',
             ' .': '+',
             '..': '^' }
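
# For example (spacing illustrative), changing
#     private Thread currentThread;
# into
#     private volatile Thread currentThread;
# makes fancy_replace print a triple of report lines like
#     - private Thread currentThread;
#     + private volatile Thread currentThread;
#     ?         +++++++++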

def plain_replace(a, alo, ahi, b, blo, bhi):
    assert alo < ahi and blo < bhi
    # dump the shorter block first -- reduces the burden on short-term
    # memory if the blocks are of very different sizes
    if bhi - blo < ahi - alo:
        dump('+', b, blo, bhi)
        dump('-', a, alo, ahi)
    else:
        dump('-', a, alo, ahi)
        dump('+', b, blo, bhi)

# When replacing one block of lines with another, this guy searches
# the blocks for *similar* lines; the best-matching pair (if any) is
# used as a synch point, and intraline difference marking is done on
# the similar pair.  Lots of work, but often worth it.

def fancy_replace(a, alo, ahi, b, blo, bhi):
    if TRACE:
        print '*** fancy_replace', alo, ahi, blo, bhi
        dump('>', a, alo, ahi)
        dump('<', b, blo, bhi)

    # don't synch up unless the lines have a similarity score of at
    # least cutoff; best_ratio tracks the best score seen so far
    best_ratio, cutoff = 0.74, 0.75
    cruncher = SequenceMatcher(IS_CHARACTER_JUNK)
    eqi, eqj = None, None   # 1st indices of equal lines (if any)

    # search for the pair that matches best without being identical
    # (identical lines must be junk lines, & we don't want to synch up
    # on junk -- unless we have to)
    for j in xrange(blo, bhi):
        bj = b[j]
        cruncher.set_seq2(bj)
        for i in xrange(alo, ahi):
            ai = a[i]
            if ai == bj:
                if eqi is None:
                    eqi, eqj = i, j
                continue
            cruncher.set_seq1(ai)
            # computing similarity is expensive, so use the quick
            # upper bounds first -- have seen this speed up messy
            # compares by a factor of 3.
            # note that ratio() is only expensive to compute the first
            # time it's called on a sequence pair; the expensive part
            # of the computation is cached by cruncher
            if cruncher.real_quick_ratio() > best_ratio and \
               cruncher.quick_ratio() > best_ratio and \
               cruncher.ratio() > best_ratio:
                best_ratio, best_i, best_j = cruncher.ratio(), i, j
    if best_ratio < cutoff:
        # no non-identical "pretty close" pair
        if eqi is None:
            # no identical pair either -- treat it as a straight replace
            plain_replace(a, alo, ahi, b, blo, bhi)
            return
        # no close pair, but an identical pair -- synch up on that
        best_i, best_j, best_ratio = eqi, eqj, 1.0
    else:
        # there's a close pair, so forget the identical pair (if any)
        eqi = None

    # a[best_i] very similar to b[best_j]; eqi is None iff they're not
    # identical
    if TRACE:
        print '*** best_ratio', best_ratio, best_i, best_j
        dump('>', a, best_i, best_i+1)
        dump('<', b, best_j, best_j+1)

    # pump out diffs from before the synch point
    fancy_helper(a, alo, best_i, b, blo, best_j)

    # do intraline marking on the synch pair
    aelt, belt = a[best_i], b[best_j]
    if eqi is None:
        # pump out a '-', '+', '?' triple for the synched lines;
        atags = btags = ""
        cruncher.set_seqs(aelt, belt)
        for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():
            la, lb = ai2 - ai1, bj2 - bj1
            if tag == 'replace':
                atags = atags + '.' * la
                btags = btags + '.' * lb
            elif tag == 'delete':
                atags = atags + '.' * la
            elif tag == 'insert':
                btags = btags + '.' * lb
            elif tag == 'equal':
                atags = atags + ' ' * la
                btags = btags + ' ' * lb
            else:
                raise ValueError, 'unknown tag ' + `tag`
        la, lb = len(atags), len(btags)
        if la < lb:
            atags = atags + ' ' * (lb - la)
        elif lb < la:
            btags = btags + ' ' * (la - lb)
        combined = map(lambda x, y: _combine[x+y], atags, btags)
        print '-', aelt, '+', belt, '?', \
              string.rstrip(string.join(combined, ''))
    else:
        # the synch pair is identical
        print ' ', aelt,

    # pump out diffs from after the synch point
    fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi)

def fancy_helper(a, alo, ahi, b, blo, bhi):
    if alo < ahi:
        if blo < bhi:
            fancy_replace(a, alo, ahi, b, blo, bhi)
        else:
            dump('-', a, alo, ahi)
    elif blo < bhi:
        dump('+', b, blo, bhi)

# open a file & return the file object; gripe and return 0 if it
# couldn't be opened
def fopen(fname):
    try:
        return open(fname, 'r')
    except IOError, detail:
        print "couldn't open " + fname + ": " + str(detail)
        return 0

# open two files & spray the diff to stdout; return false iff a problem
def fcompare(f1name, f2name):
    f1 = fopen(f1name)
    f2 = fopen(f2name)
    if not f1 or not f2:
        return 0

    a = f1.readlines(); f1.close()
    b = f2.readlines(); f2.close()

    cruncher = SequenceMatcher(IS_LINE_JUNK, a, b)
    for tag, alo, ahi, blo, bhi in cruncher.get_opcodes():
        if tag == 'replace':
            fancy_replace(a, alo, ahi, b, blo, bhi)
        elif tag == 'delete':
            dump('-', a, alo, ahi)
        elif tag == 'insert':
            dump('+', b, blo, bhi)
        elif tag == 'equal':
            dump(' ', a, alo, ahi)
        else:
            raise ValueError, 'unknown tag ' + `tag`

    return 1

# crack args (sys.argv[1:] is normal) & compare;
# return false iff a problem

def main(args):
    import getopt
    try:
        opts, args = getopt.getopt(args, "q")
    except getopt.error, detail:
        print str(detail)
        print __doc__
        return 0
    noisy = 1
    for opt, val in opts:
        if opt == "-q":
            noisy = 0
    if len(args) != 2:
        print 'need 2 args'
        print __doc__
        return 0
    f1name, f2name = args
    if noisy:
        print '-:', f1name
        print '+:', f2name
    return fcompare(f1name, f2name)

if __name__ == '__main__':
    import sys
    args = sys.argv[1:]
    if 1:
        main(args)
    else:
        import profile, pstats
        statf = "ndiff.pro"
        profile.run("main(args)", statf)
        stats = pstats.Stats(statf)
        stats.strip_dirs().sort_stats('time').print_stats()