Blame - Tools/scripts/ndiff.py - platform/external/python/cpython2

blob: 2ba5e5370596e751848366927b32ea5e7b2ac0b4 [file] [log] [blame]

Guido van Rossum	83b8518	1998-05-06 17:43:30 +0000	[diff] [blame]	1	#! /usr/bin/env python
				2
				3	# Released to the public domain $JustDate: 3/16/98 $,
				4	# by Tim Peters (email tim_one@email.msn.com).
				5
				6	# ndiff file1 file2 -- a human-friendly file differencer.
				7
				8	# $Revision$
				9	# $NoKeywords: $
				10
				11	# SequenceMatcher tries to compute a "human-friendly diff" between
				12	# two sequences (chiefly picturing a file as a sequence of lines,
				13	# and a line as a sequence of characters, here). Unlike UNIX(tm) diff,
				14	# e.g., the fundamental notion is the longest contiguous & junk-free
				15	# matching subsequence. That's what catches peoples' eyes. The
				16	# Windows(tm) windiff has another interesting notion, pairing up elements
				17	# that appear uniquely in each sequence. That, and the method here,
				18	# appear to yield more intuitive difference reports than does diff. This
				19	# method appears to be the least vulnerable to synching up on blocks
				20	# of "junk lines", though (like blank lines in ordinary text files,
				21	# or maybe "<P>" lines in HTML files). That may be because this is
				22	# the only method of the 3 that has a concept of "junk" <wink>.
				23	#
				24	# Note that ndiff makes no claim to produce a minimal diff. To the
				25	# contrary, minimal diffs are often counter-intuitive, because they
				26	# synch up anywhere possible, sometimes accidental matches 100 pages
				27	# apart. Restricting synch points to contiguous matches preserves some
				28	# notion of locality, at the occasional cost of producing a longer diff.
				29	#
				30	# With respect to junk, an earlier verion of ndiff simply refused to
				31	# start a match with a junk element. The result was cases like this:
				32	# before: private Thread currentThread;
				33	# after: private volatile Thread currentThread;
				34	# If you consider whitespace to be junk, the longest continguous match
				35	# not starting with junk is "e Thread currentThread". So ndiff reported
				36	# that "e volatil" was inserted between the 't' and the 'e' in "private".
				37	# While an accurate view, to people that's absurd. The current version
				38	# looks for matching blocks that are entirely junk-free, then extends the
				39	# longest one of those as far as possible but only with matching junk.
				40	# So now "currentThread" is matched, then extended to suck up the
				41	# preceding blank; then "private" is matched, and extended to suck up the
				42	# following blank; then "Thread" is matched; and finally ndiff reports
				43	# that "volatile " was inserted before "Thread". The only quibble
				44	# remaining is that perhaps it was really the case that " volative"
				45	# was inserted after "private". I can live with that <wink>.
				46	#
				47	# NOTE on the output: From an ndiff report,
				48	# 1) The first file can be recovered by retaining only lines that begin
				49	# with " " or "- ", and deleting those 2-character prefixes.
				50	# 2) The second file can be recovered similarly, but by retaining only
				51	# " " and "+ " lines.
				52	# 3) Lines beginning with "? " attempt to guide the eye to intraline
				53	# differences, and were not present in either input file.
				54	#
				55	# NOTE on junk: the module-level names
				56	# IS_LINE_JUNK
				57	# IS_CHARACTER_JUNK
				58	# can be set to any functions you like. The first one should accept
				59	# a single string argument, and return true iff the string is junk.
				60	# The default is whether the regexp r"\s#?\s$" matches (i.e., a
				61	# line without visible characters, except for at most one splat).
				62	# The second should accept a string of length 1 etc. The default is
				63	# whether the character is a blank or tab (note: bad idea to include
				64	# newline in this!).
				65	#
				66	# After setting those, you can call fcompare(f1name, f2name) with the
				67	# names of the files you want to compare. The difference report
				68	# is sent to stdout. Or you can call main(), which expects to find
				69	# (exactly) the two file names in sys.argv.
				70
				71	import string
				72	TRACE = 0
				73
				74	# define what "junk" means
				75	import re
				76
				77	def IS_LINE_JUNK(line, pat=re.compile(r"\s#?\s$").match):
				78	return pat(line) is not None
				79
				80	def IS_CHARACTER_JUNK(ch, ws=" \t"):
				81	return ch in ws
				82
				83	del re
				84
				85	class SequenceMatcher:
				86	def __init__(self, isjunk=None, a='', b=''):
				87	# Members:
				88	# a
				89	# first sequence
				90	# b
				91	# second sequence; differences are computed as "what do
				92	# we need to do to 'a' to change it into 'b'?"
				93	# b2j
				94	# for x in b, b2j[x] is a list of the indices (into b)
				95	# at which x appears; junk elements do not appear
				96	# b2jhas
				97	# b2j.has_key
				98	# fullbcount
				99	# for x in b, fullbcount[x] == the number of times x
				100	# appears in b; only materialized if really needed (used
				101	# only for computing quick_ratio())
				102	# matching_blocks
				103	# a list of (i, j, k) triples, where a[i:i+k] == b[j:j+k];
				104	# ascending & non-overlapping in i and in j; terminated by
				105	# a dummy (len(a), len(b), 0) sentinel
				106	# opcodes
				107	# a list of (tag, i1, i2, j1, j2) tuples, where tag is
				108	# one of
				109	# 'replace' a[i1:i2] should be replaced by b[j1:j2]
				110	# 'delete' a[i1:i2] should be deleted
				111	# 'insert' b[j1:j2] should be inserted
				112	# 'equal' a[i1:i2] == b[j1:j2]
				113	# isjunk
				114	# a user-supplied function taking a sequence element and
				115	# returning true iff the element is "junk" -- this has
				116	# subtle but helpful effects on the algorithm, which I'll
				117	# get around to writing up someday <0.9 wink>.
				118	# DON'T USE! Only __chain_b uses this. Use isbjunk.
				119	# isbjunk
				120	# for x in b, isbjunk(x) == isjunk(x) but much faster;
				121	# it's really the has_key method of a hidden dict.
				122	# DOES NOT WORK for x in a!
				123
				124	self.isjunk = isjunk
				125	self.a = self.b = None
				126	self.set_seqs(a, b)
				127
				128	def set_seqs(self, a, b):
				129	self.set_seq1(a)
				130	self.set_seq2(b)
				131
				132	def set_seq1(self, a):
				133	if a is self.a:
				134	return
				135	self.a = a
				136	self.matching_blocks = self.opcodes = None
				137
				138	def set_seq2(self, b):
				139	if b is self.b:
				140	return
				141	self.b = b
				142	self.matching_blocks = self.opcodes = None
				143	self.fullbcount = None
				144	self.__chain_b()
				145
				146	# for each element x in b, set b2j[x] to a list of the indices in
				147	# b where x appears; the indices are in increasing order; note that
				148	# the number of times x appears in b is len(b2j[x]) ...
				149	# when self.isjunk is defined, junk elements don't show up in this
				150	# map at all, which stops the central find_longest_match method
				151	# from starting any matching block at a junk element ...
				152	# also creates the fast isbjunk function ...
				153	# note that this is only called when b changes; so for cross-product
				154	# kinds of matches, it's best to call set_seq2 once, then set_seq1
				155	# repeatedly
				156
				157	def __chain_b(self):
				158	# Because isjunk is a user-defined (not C) function, and we test
				159	# for junk a LOT, it's important to minimize the number of calls.
				160	# Before the tricks described here, __chain_b was by far the most
				161	# time-consuming routine in the whole module! If anyone sees
				162	# Jim Roskind, thank him again for profile.py -- I never would
				163	# have guessed that.
				164	# The first trick is to build b2j ignoring the possibility
				165	# of junk. I.e., we don't call isjunk at all yet. Throwing
				166	# out the junk later is much cheaper than building b2j "right"
				167	# from the start.
				168	b = self.b
				169	self.b2j = b2j = {}
				170	self.b2jhas = b2jhas = b2j.has_key
				171	for i in xrange(0, len(b)):
				172	elt = b[i]
				173	if b2jhas(elt):
				174	b2j[elt].append(i)
				175	else:
				176	b2j[elt] = [i]
				177
				178	# Now b2j.keys() contains elements uniquely, and especially when
				179	# the sequence is a string, that's usually a good deal smaller
				180	# than len(string). The difference is the number of isjunk calls
				181	# saved.
				182	isjunk, junkdict = self.isjunk, {}
				183	if isjunk:
				184	for elt in b2j.keys():
				185	if isjunk(elt):
				186	junkdict[elt] = 1 # value irrelevant; it's a set
				187	del b2j[elt]
				188
				189	# Now for x in b, isjunk(x) == junkdict.has_key(x), but the
				190	# latter is much faster. Note too that while there may be a
				191	# lot of junk in the sequence, the number of unique junk
				192	# elements is probably small. So the memory burden of keeping
				193	# this dict alive is likely trivial compared to the size of b2j.
				194	self.isbjunk = junkdict.has_key
				195
				196	def find_longest_match(self, alo, ahi, blo, bhi):
				197	"""Find longest matching block in a[alo:ahi] and b[blo:bhi].
				198
				199	If isjunk is not defined:
				200
				201	Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
				202	alo <= i <= i+k <= ahi
				203	blo <= j <= j+k <= bhi
				204	and for all (i',j',k') meeting those conditions,
				205	k >= k'
				206	i <= i'
				207	and if i == i', j <= j'
				208	In other words, of all maximal matching blocks, returns one
				209	that starts earliest in a, and of all those maximal matching
				210	blocks that start earliest in a, returns the one that starts
				211	earliest in b.
				212
				213	If isjunk is defined, first the longest matching block is
				214	determined as above, but with the additional restriction that
				215	no junk element appears in the block. Then that block is
				216	extended as far as possible by matching (only) junk elements on
				217	both sides. So the resulting block never matches on junk except
				218	as identical junk happens to be adjacent to an "interesting"
				219	match.
				220
				221	If no blocks match, returns (alo, blo, 0).
				222	"""
				223
				224	# CAUTION: stripping common prefix or suffix would be incorrect.
				225	# E.g.,
				226	# ab
				227	# acab
				228	# Longest matching block is "ab", but if common prefix is
				229	# stripped, it's "a" (tied with "b"). UNIX(tm) diff does so
				230	# strip, so ends up claiming that ab is changed to acab by
				231	# inserting "ca" in the middle. That's minimal but unintuitive:
				232	# "it's obvious" that someone inserted "ac" at the front.
				233	# Windiff ends up at the same place as diff, but by pairing up
				234	# the unique 'b's and then matching the first two 'a's.
				235
				236	# find longest junk-free match
				237	a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
				238	besti, bestj, bestsize = alo, blo, 0
				239	for i in xrange(alo, ahi):
				240	# check for longest match starting at a[i]
				241	if i + bestsize >= ahi:
				242	# we're too far right to get a new best
				243	break
				244	# look at all instances of a[i] in b; note that because
				245	# b2j has no junk keys, the loop is skipped if a[i] is junk
				246	for j in b2j.get(a[i], []):
				247	# a[i] matches b[j]
				248	if j < blo:
				249	continue
				250	if j + bestsize >= bhi:
				251	# we're too far right to get a new best, here or
				252	# anywhere to the right
				253	break
				254	if a[i + bestsize] != b[j + bestsize]:
				255	# can't be longer match; this test is not necessary
				256	# for correctness, but is a huge win for efficiency
				257	continue
				258	# set k to length of match
				259	k = 1 # a[i] == b[j] already known
				260	while i + k < ahi and j + k < bhi and \
				261	a[i+k] == b[j+k] and not isbjunk(b[j+k]):
				262	k = k + 1
				263	if k > bestsize:
				264	besti, bestj, bestsize = i, j, k
				265	if i + bestsize >= ahi:
				266	# only time in my life I really wanted a
				267	# labelled break <wink> -- we're done with
				268	# both loops now
				269	break
				270
				271	# Now that we have a wholly interesting match (albeit possibly
				272	# empty!), we may as well suck up the matching junk on each
				273	# side of it too. Can't think of a good reason not to, and it
				274	# saves post-processing the (possibly considerable) expense of
				275	# figuring out what to do with it. In the case of an empty
				276	# interesting match, this is clearly the right thing to do,
				277	# because no other kind of match is possible in the regions.
				278	while besti > alo and bestj > blo and \
				279	isbjunk(b[bestj-1]) and \
				280	a[besti-1] == b[bestj-1]:
				281	besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
				282	while besti+bestsize < ahi and bestj+bestsize < bhi and \
				283	isbjunk(b[bestj+bestsize]) and \
				284	a[besti+bestsize] == b[bestj+bestsize]:
				285	bestsize = bestsize + 1
				286
				287	if TRACE:
				288	print "get_matching_blocks", alo, ahi, blo, bhi
				289	print " returns", besti, bestj, bestsize
				290	return besti, bestj, bestsize
				291
				292	# A different implementation, using a binary doubling technique that
				293	# does far fewer element compares (trades 'em for integer compares),
				294	# and has n*lg n worst-case behavior. Alas, the code is much harder
				295	# to follow (the details are tricky!), and in most cases I've seen,
				296	# it takes at least 50% longer than the "clever dumb" method above;
				297	# probably due to creating layers of small dicts.
				298	# NOTE: this no longer matches the version above wrt junk; remains
				299	# too unpromising to update it; someday, though ...
				300
				301	# def find_longest_match(self, alo, ahi, blo, bhi):
				302	# """Find longest matching block in a[alo:ahi] and b[blo:bhi].
				303	#
				304	# Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
				305	# alo <= i <= i+k <= ahi
				306	# blo <= j <= j+k <= bhi
				307	# and for all (i',j',k') meeting those conditions,
				308	# k >= k'
				309	# i <= i'
				310	# and if i == i', j <= j'
				311	# In other words, of all maximal matching blocks, returns one
				312	# that starts earliest in a, and of all those maximal matching
				313	# blocks that start earliest in a, returns the one that starts
				314	# earliest in b.
				315	#
				316	# If no blocks match, returns (alo, blo, 0).
				317	# """
				318	#
				319	# a, b2j = self.a, self.b2j
				320	# # alljs[size][i] is a set of all j's s.t. a[i:i+len] matches
				321	# # b[j:j+len]
				322	# alljs = {}
				323	# alljs[1] = js = {}
				324	# ahits = {}
				325	# for i in xrange(alo, ahi):
				326	# elt = a[i]
				327	# if ahits.has_key(elt):
				328	# js[i] = ahits[elt]
				329	# continue
				330	# if b2j.has_key(elt):
				331	# in_range = {}
				332	# for j in b2j[elt]:
				333	# if j >= blo:
				334	# if j >= bhi:
				335	# break
				336	# in_range[j] = 1
				337	# if in_range:
				338	# ahits[elt] = js[i] = in_range
				339	# del ahits
				340	# size = 1
				341	# while js:
				342	# oldsize = size
				343	# size = size + size
				344	# oldjs = js
				345	# alljs[size] = js = {}
				346	# for i in oldjs.keys():
				347	# # i has matches of size oldsize
				348	# if not oldjs.has_key(i + oldsize):
				349	# # can't double it
				350	# continue
				351	# second_js = oldjs[i + oldsize]
				352	# answer = {}
				353	# for j in oldjs[i].keys():
				354	# if second_js.has_key(j + oldsize):
				355	# answer[j] = 1
				356	# if answer:
				357	# js[i] = answer
				358	# del alljs[size]
				359	# size = size >> 1 # max power of 2 with a match
				360	# if not size:
				361	# return alo, blo, 0
				362	# besti, bestj, bestsize = alo, blo, 0
				363	# fatis = alljs[size].keys()
				364	# fatis.sort()
				365	# for i in fatis:
				366	# # figure out longest match starting at a[i]
				367	# totalsize = halfsize = size
				368	# # i has matches of len totalsize at the indices in js
				369	# js = alljs[size][i].keys()
				370	# while halfsize > 1:
				371	# halfsize = halfsize >> 1
				372	# # is there a match of len halfsize starting at
				373	# # i + totalsize?
				374	# newjs = []
				375	# if alljs[halfsize].has_key(i + totalsize):
				376	# second_js = alljs[halfsize][i + totalsize]
				377	# for j in js:
				378	# if second_js.has_key(j + totalsize):
				379	# newjs.append(j)
				380	# if newjs:
				381	# totalsize = totalsize + halfsize
				382	# js = newjs
				383	# if totalsize > bestsize:
				384	# besti, bestj, bestsize = i, min(js), totalsize
				385	# return besti, bestj, bestsize
				386
				387	def get_matching_blocks(self):
				388	if self.matching_blocks is not None:
				389	return self.matching_blocks
				390	self.matching_blocks = []
				391	la, lb = len(self.a), len(self.b)
				392	self.__helper(0, la, 0, lb, self.matching_blocks)
				393	self.matching_blocks.append( (la, lb, 0) )
				394	if TRACE:
				395	print '*** matching blocks', self.matching_blocks
				396	return self.matching_blocks
				397
				398	# builds list of matching blocks covering a[alo:ahi] and
				399	# b[blo:bhi], appending them in increasing order to answer
				400
				401	def __helper(self, alo, ahi, blo, bhi, answer):
				402	i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi)
				403	# a[alo:i] vs b[blo:j] unknown
				404	# a[i:i+k] same as b[j:j+k]
				405	# a[i+k:ahi] vs b[j+k:bhi] unknown
				406	if k:
				407	if alo < i and blo < j:
				408	self.__helper(alo, i, blo, j, answer)
				409	answer.append( x )
				410	if i+k < ahi and j+k < bhi:
				411	self.__helper(i+k, ahi, j+k, bhi, answer)
				412
				413	def ratio(self):
				414	"""Return a measure of the sequences' similarity (float in [0,1]).
				415
				416	Where T is the total number of elements in both sequences, and
				417	M is the number of matches, this is 2*M / T.
				418	Note that this is 1 if the sequences are identical, and 0 if
				419	they have nothing in common.
				420	"""
				421
				422	matches = reduce(lambda sum, triple: sum + triple[-1],
				423	self.get_matching_blocks(), 0)
				424	return 2.0 * matches / (len(self.a) + len(self.b))
				425
				426	def quick_ratio(self):
				427	"""Return an upper bound on ratio() relatively quickly."""
				428	# viewing a and b as multisets, set matches to the cardinality
				429	# of their intersection; this counts the number of matches
				430	# without regard to order, so is clearly an upper bound
				431	if self.fullbcount is None:
				432	self.fullbcount = fullbcount = {}
				433	for elt in self.b:
				434	fullbcount[elt] = fullbcount.get(elt, 0) + 1
				435	fullbcount = self.fullbcount
				436	# avail[x] is the number of times x appears in 'b' less the
				437	# number of times we've seen it in 'a' so far ... kinda
				438	avail = {}
				439	availhas, matches = avail.has_key, 0
				440	for elt in self.a:
				441	if availhas(elt):
				442	numb = avail[elt]
				443	else:
				444	numb = fullbcount.get(elt, 0)
				445	avail[elt] = numb - 1
				446	if numb > 0:
				447	matches = matches + 1
				448	return 2.0 * matches / (len(self.a) + len(self.b))
				449
				450	def real_quick_ratio(self):
				451	"""Return an upper bound on ratio() very quickly"""
				452	la, lb = len(self.a), len(self.b)
				453	# can't have more matches than the number of elements in the
				454	# shorter sequence
				455	return 2.0 * min(la, lb) / (la + lb)
				456
				457	def get_opcodes(self):
				458	if self.opcodes is not None:
				459	return self.opcodes
				460	i = j = 0
				461	self.opcodes = answer = []
				462	for ai, bj, size in self.get_matching_blocks():
				463	# invariant: we've pumped out correct diffs to change
				464	# a[:i] into b[:j], and the next matching block is
				465	# a[ai:ai+size] == b[bj:bj+size]. So we need to pump
				466	# out a diff to change a[i:ai] into b[j:bj], pump out
				467	# the matching block, and move (i,j) beyond the match
				468	tag = ''
				469	if i < ai and j < bj:
				470	tag = 'replace'
				471	elif i < ai:
				472	tag = 'delete'
				473	elif j < bj:
				474	tag = 'insert'
				475	if tag:
				476	answer.append( (tag, i, ai, j, bj) )
				477	i, j = ai+size, bj+size
				478	# the list of matching blocks is terminated by a
				479	# sentinel with size 0
				480	if size:
				481	answer.append( ('equal', ai, i, bj, j) )
				482	return answer
				483
				484	# meant for dumping lines
				485	def dump(tag, x, lo, hi):
				486	for i in xrange(lo, hi):
				487	print tag, x[i],
				488
				489	# figure out which mark to stick under characters in lines that
				490	# have changed (blank = same, - = deleted, + = inserted, ^ = replaced)
				491	_combine = { ' ': ' ',
				492	'. ': '-',
				493	' .': '+',
				494	'..': '^' }
				495
				496	def plain_replace(a, alo, ahi, b, blo, bhi):
				497	assert alo < ahi and blo < bhi
				498	# dump the shorter block first -- reduces the burden on short-term
				499	# memory if the blocks are of very different sizes
				500	if bhi - blo < ahi - alo:
				501	dump('+', b, blo, bhi)
				502	dump('-', a, alo, ahi)
				503	else:
				504	dump('-', a, alo, ahi)
				505	dump('+', b, blo, bhi)
				506
				507	# When replacing one block of lines with another, this guy searches
				508	# the blocks for similar lines; the best-matching pair (if any) is
				509	# used as a synch point, and intraline difference marking is done on
				510	# the similar pair. Lots of work, but often worth it.
				511
				512	def fancy_replace(a, alo, ahi, b, blo, bhi):
				513	if TRACE:
				514	print '*** fancy_replace', alo, ahi, blo, bhi
				515	dump('>', a, alo, ahi)
				516	dump('<', b, blo, bhi)
				517
				518	# don't synch up unless the lines have a similarity score of at
				519	# least cutoff; best_ratio tracks the best score seen so far
				520	best_ratio, cutoff = 0.74, 0.75
				521	cruncher = SequenceMatcher(IS_CHARACTER_JUNK)
				522	eqi, eqj = None, None # 1st indices of equal lines (if any)
				523
				524	# search for the pair that matches best without being identical
				525	# (identical lines must be junk lines, & we don't want to synch up
				526	# on junk -- unless we have to)
				527	for j in xrange(blo, bhi):
				528	bj = b[j]
				529	cruncher.set_seq2(bj)
				530	for i in xrange(alo, ahi):
				531	ai = a[i]
				532	if ai == bj:
				533	if eqi is None:
				534	eqi, eqj = i, j
				535	continue
				536	cruncher.set_seq1(ai)
				537	# computing similarity is expensive, so use the quick
				538	# upper bounds first -- have seen this speed up messy
				539	# compares by a factor of 3.
				540	# note that ratio() is only expensive to compute the first
				541	# time it's called on a sequence pair; the expensive part
				542	# of the computation is cached by cruncher
				543	if cruncher.real_quick_ratio() > best_ratio and \
				544	cruncher.quick_ratio() > best_ratio and \
				545	cruncher.ratio() > best_ratio:
				546	best_ratio, best_i, best_j = cruncher.ratio(), i, j
				547	if best_ratio < cutoff:
				548	# no non-identical "pretty close" pair
				549	if eqi is None:
				550	# no identical pair either -- treat it as a straight replace
				551	plain_replace(a, alo, ahi, b, blo, bhi)
				552	return
				553	# no close pair, but an identical pair -- synch up on that
				554	best_i, best_j, best_ratio = eqi, eqj, 1.0
				555	else:
				556	# there's a close pair, so forget the identical pair (if any)
				557	eqi = None
				558
				559	# a[best_i] very similar to b[best_j]; eqi is None iff they're not
				560	# identical
				561	if TRACE:
				562	print '*** best_ratio', best_ratio, best_i, best_j
				563	dump('>', a, best_i, best_i+1)
				564	dump('<', b, best_j, best_j+1)
				565
				566	# pump out diffs from before the synch point
				567	fancy_helper(a, alo, best_i, b, blo, best_j)
				568
				569	# do intraline marking on the synch pair
				570	aelt, belt = a[best_i], b[best_j]
				571	if eqi is None:
				572	# pump out a '-', '+', '?' triple for the synched lines;
				573	atags = btags = ""
				574	cruncher.set_seqs(aelt, belt)
				575	for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():
				576	la, lb = ai2 - ai1, bj2 - bj1
				577	if tag == 'replace':
				578	atags = atags + '.' * la
				579	btags = btags + '.' * lb
				580	elif tag == 'delete':
				581	atags = atags + '.' * la
				582	elif tag == 'insert':
				583	btags = btags + '.' * lb
				584	elif tag == 'equal':
				585	atags = atags + ' ' * la
				586	btags = btags + ' ' * lb
				587	else:
				588	raise ValueError, 'unknown tag ' + `tag`
				589	la, lb = len(atags), len(btags)
				590	if la < lb:
				591	atags = atags + ' ' * (lb - la)
				592	elif lb < la:
				593	btags = btags + ' ' * (la - lb)
				594	combined = map(lambda x,y: _combine[x+y], atags, btags)
				595	print '-', aelt, '+', belt, '?', \
				596	string.rstrip(string.join(combined, ''))
				597	else:
				598	# the synch pair is identical
				599	print ' ', aelt,
				600
				601	# pump out diffs from after the synch point
				602	fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi)
				603
				604	def fancy_helper(a, alo, ahi, b, blo, bhi):
				605	if alo < ahi:
				606	if blo < bhi:
				607	fancy_replace(a, alo, ahi, b, blo, bhi)
				608	else:
				609	dump('-', a, alo, ahi)
				610	elif blo < bhi:
				611	dump('+', b, blo, bhi)
				612
				613	# open a file & return the file object; gripe and return 0 if it
				614	# couldn't be opened
				615	def fopen(fname):
				616	try:
				617	return open(fname, 'r')
				618	except IOError, detail:
				619	print "couldn't open " + fname + ": " + `detail`
				620	return 0
				621
				622	# open two files & spray the diff to stdout; return false iff a problem
				623	def fcompare(f1name, f2name):
				624	f1 = fopen(f1name)
				625	f2 = fopen(f2name)
				626	if not f1 or not f2:
				627	return 0
				628
				629	a = f1.readlines(); f1.close()
				630	b = f2.readlines(); f2.close()
				631
				632	cruncher = SequenceMatcher(IS_LINE_JUNK, a, b)
				633	for tag, alo, ahi, blo, bhi in cruncher.get_opcodes():
				634	if tag == 'replace':
				635	fancy_replace(a, alo, ahi, b, blo, bhi)
				636	elif tag == 'delete':
				637	dump('-', a, alo, ahi)
				638	elif tag == 'insert':
				639	dump('+', b, blo, bhi)
				640	elif tag == 'equal':
				641	dump(' ', a, alo, ahi)
				642	else:
				643	raise ValueError, 'unknown tag ' + `tag`
				644
				645	return 1
				646
				647	# get file names from argv & compare; return false iff a problem
				648	def main():
				649	from sys import argv
				650	if len(argv) != 3:
				651	print 'need 2 args'
				652	return 0
				653	[f1name, f2name] = argv[1:3]
				654	print '-:', f1name
				655	print '+:', f2name
				656	return fcompare(f1name, f2name)
				657
				658	if __name__ == '__main__':
				659	if 1:
				660	main()
				661	else:
				662	import profile, pstats
				663	statf = "ndiff.pro"
				664	profile.run("main()", statf)
				665	stats = pstats.Stats(statf)
				666	stats.strip_dirs().sort_stats('time').print_stats()
				667