Blame - Tools/scripts/ndiff.py - platform/external/python/cpython3

blob: 4767d1f0113495beffa17d30fb8978a02656a9fb [file] [log] [blame]

Guido van Rossum	83b8518	1998-05-06 17:43:30 +0000	[diff] [blame]	1	#! /usr/bin/env python
				2
				3	# Released to the public domain $JustDate: 3/16/98 $,
				4	# by Tim Peters (email tim_one@email.msn.com).
				5
				6	# ndiff file1 file2 -- a human-friendly file differencer.
				7
				8	# $Revision$
Guido van Rossum	83b8518	1998-05-06 17:43:30 +0000	[diff] [blame]	9
				10	# SequenceMatcher tries to compute a "human-friendly diff" between
				11	# two sequences (chiefly picturing a file as a sequence of lines,
				12	# and a line as a sequence of characters, here). Unlike UNIX(tm) diff,
				13	# e.g., the fundamental notion is the longest contiguous & junk-free
				14	# matching subsequence. That's what catches peoples' eyes. The
				15	# Windows(tm) windiff has another interesting notion, pairing up elements
				16	# that appear uniquely in each sequence. That, and the method here,
				17	# appear to yield more intuitive difference reports than does diff. This
				18	# method appears to be the least vulnerable to synching up on blocks
				19	# of "junk lines", though (like blank lines in ordinary text files,
				20	# or maybe "<P>" lines in HTML files). That may be because this is
				21	# the only method of the 3 that has a concept of "junk" <wink>.
				22	#
				23	# Note that ndiff makes no claim to produce a minimal diff. To the
				24	# contrary, minimal diffs are often counter-intuitive, because they
				25	# synch up anywhere possible, sometimes accidental matches 100 pages
				26	# apart. Restricting synch points to contiguous matches preserves some
				27	# notion of locality, at the occasional cost of producing a longer diff.
				28	#
				29	# With respect to junk, an earlier verion of ndiff simply refused to
				30	# start a match with a junk element. The result was cases like this:
				31	# before: private Thread currentThread;
				32	# after: private volatile Thread currentThread;
				33	# If you consider whitespace to be junk, the longest continguous match
				34	# not starting with junk is "e Thread currentThread". So ndiff reported
				35	# that "e volatil" was inserted between the 't' and the 'e' in "private".
				36	# While an accurate view, to people that's absurd. The current version
				37	# looks for matching blocks that are entirely junk-free, then extends the
				38	# longest one of those as far as possible but only with matching junk.
				39	# So now "currentThread" is matched, then extended to suck up the
				40	# preceding blank; then "private" is matched, and extended to suck up the
				41	# following blank; then "Thread" is matched; and finally ndiff reports
				42	# that "volatile " was inserted before "Thread". The only quibble
				43	# remaining is that perhaps it was really the case that " volative"
				44	# was inserted after "private". I can live with that <wink>.
				45	#
				46	# NOTE on the output: From an ndiff report,
				47	# 1) The first file can be recovered by retaining only lines that begin
				48	# with " " or "- ", and deleting those 2-character prefixes.
				49	# 2) The second file can be recovered similarly, but by retaining only
				50	# " " and "+ " lines.
				51	# 3) Lines beginning with "? " attempt to guide the eye to intraline
				52	# differences, and were not present in either input file.
				53	#
Guido van Rossum	27c225e	1998-05-08 21:04:06 +0000	[diff] [blame]	54	# COROLLARY:
				55	# On Unix, the second file can be recovered by piping the output through
				56	# sed -n '/^[+ ] /s/^..//p'
				57	# Modifications to recover the first file are left as an exercise for
				58	# the reader.
				59	#
Guido van Rossum	83b8518	1998-05-06 17:43:30 +0000	[diff] [blame]	60	# NOTE on junk: the module-level names
				61	# IS_LINE_JUNK
				62	# IS_CHARACTER_JUNK
				63	# can be set to any functions you like. The first one should accept
				64	# a single string argument, and return true iff the string is junk.
				65	# The default is whether the regexp r"\s#?\s$" matches (i.e., a
				66	# line without visible characters, except for at most one splat).
				67	# The second should accept a string of length 1 etc. The default is
				68	# whether the character is a blank or tab (note: bad idea to include
				69	# newline in this!).
				70	#
				71	# After setting those, you can call fcompare(f1name, f2name) with the
				72	# names of the files you want to compare. The difference report
				73	# is sent to stdout. Or you can call main(), which expects to find
				74	# (exactly) the two file names in sys.argv.
				75
				76	import string
				77	TRACE = 0
				78
				79	# define what "junk" means
				80	import re
				81
				82	def IS_LINE_JUNK(line, pat=re.compile(r"\s#?\s$").match):
				83	return pat(line) is not None
				84
				85	def IS_CHARACTER_JUNK(ch, ws=" \t"):
				86	return ch in ws
				87
				88	del re
				89
				90	class SequenceMatcher:
				91	def __init__(self, isjunk=None, a='', b=''):
				92	# Members:
				93	# a
				94	# first sequence
				95	# b
				96	# second sequence; differences are computed as "what do
				97	# we need to do to 'a' to change it into 'b'?"
				98	# b2j
				99	# for x in b, b2j[x] is a list of the indices (into b)
				100	# at which x appears; junk elements do not appear
				101	# b2jhas
				102	# b2j.has_key
				103	# fullbcount
				104	# for x in b, fullbcount[x] == the number of times x
				105	# appears in b; only materialized if really needed (used
				106	# only for computing quick_ratio())
				107	# matching_blocks
				108	# a list of (i, j, k) triples, where a[i:i+k] == b[j:j+k];
				109	# ascending & non-overlapping in i and in j; terminated by
				110	# a dummy (len(a), len(b), 0) sentinel
				111	# opcodes
				112	# a list of (tag, i1, i2, j1, j2) tuples, where tag is
				113	# one of
				114	# 'replace' a[i1:i2] should be replaced by b[j1:j2]
				115	# 'delete' a[i1:i2] should be deleted
				116	# 'insert' b[j1:j2] should be inserted
				117	# 'equal' a[i1:i2] == b[j1:j2]
				118	# isjunk
				119	# a user-supplied function taking a sequence element and
				120	# returning true iff the element is "junk" -- this has
				121	# subtle but helpful effects on the algorithm, which I'll
				122	# get around to writing up someday <0.9 wink>.
				123	# DON'T USE! Only __chain_b uses this. Use isbjunk.
				124	# isbjunk
				125	# for x in b, isbjunk(x) == isjunk(x) but much faster;
				126	# it's really the has_key method of a hidden dict.
				127	# DOES NOT WORK for x in a!
				128
				129	self.isjunk = isjunk
				130	self.a = self.b = None
				131	self.set_seqs(a, b)
				132
				133	def set_seqs(self, a, b):
				134	self.set_seq1(a)
				135	self.set_seq2(b)
				136
				137	def set_seq1(self, a):
				138	if a is self.a:
				139	return
				140	self.a = a
				141	self.matching_blocks = self.opcodes = None
				142
				143	def set_seq2(self, b):
				144	if b is self.b:
				145	return
				146	self.b = b
				147	self.matching_blocks = self.opcodes = None
				148	self.fullbcount = None
				149	self.__chain_b()
				150
				151	# for each element x in b, set b2j[x] to a list of the indices in
				152	# b where x appears; the indices are in increasing order; note that
				153	# the number of times x appears in b is len(b2j[x]) ...
				154	# when self.isjunk is defined, junk elements don't show up in this
				155	# map at all, which stops the central find_longest_match method
				156	# from starting any matching block at a junk element ...
				157	# also creates the fast isbjunk function ...
				158	# note that this is only called when b changes; so for cross-product
				159	# kinds of matches, it's best to call set_seq2 once, then set_seq1
				160	# repeatedly
				161
				162	def __chain_b(self):
				163	# Because isjunk is a user-defined (not C) function, and we test
				164	# for junk a LOT, it's important to minimize the number of calls.
				165	# Before the tricks described here, __chain_b was by far the most
				166	# time-consuming routine in the whole module! If anyone sees
				167	# Jim Roskind, thank him again for profile.py -- I never would
				168	# have guessed that.
				169	# The first trick is to build b2j ignoring the possibility
				170	# of junk. I.e., we don't call isjunk at all yet. Throwing
				171	# out the junk later is much cheaper than building b2j "right"
				172	# from the start.
				173	b = self.b
				174	self.b2j = b2j = {}
				175	self.b2jhas = b2jhas = b2j.has_key
				176	for i in xrange(0, len(b)):
				177	elt = b[i]
				178	if b2jhas(elt):
				179	b2j[elt].append(i)
				180	else:
				181	b2j[elt] = [i]
				182
				183	# Now b2j.keys() contains elements uniquely, and especially when
				184	# the sequence is a string, that's usually a good deal smaller
				185	# than len(string). The difference is the number of isjunk calls
				186	# saved.
				187	isjunk, junkdict = self.isjunk, {}
				188	if isjunk:
				189	for elt in b2j.keys():
				190	if isjunk(elt):
				191	junkdict[elt] = 1 # value irrelevant; it's a set
				192	del b2j[elt]
				193
				194	# Now for x in b, isjunk(x) == junkdict.has_key(x), but the
				195	# latter is much faster. Note too that while there may be a
				196	# lot of junk in the sequence, the number of unique junk
				197	# elements is probably small. So the memory burden of keeping
				198	# this dict alive is likely trivial compared to the size of b2j.
				199	self.isbjunk = junkdict.has_key
				200
				201	def find_longest_match(self, alo, ahi, blo, bhi):
				202	"""Find longest matching block in a[alo:ahi] and b[blo:bhi].
				203
				204	If isjunk is not defined:
				205
				206	Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
				207	alo <= i <= i+k <= ahi
				208	blo <= j <= j+k <= bhi
				209	and for all (i',j',k') meeting those conditions,
				210	k >= k'
				211	i <= i'
				212	and if i == i', j <= j'
				213	In other words, of all maximal matching blocks, returns one
				214	that starts earliest in a, and of all those maximal matching
				215	blocks that start earliest in a, returns the one that starts
				216	earliest in b.
				217
				218	If isjunk is defined, first the longest matching block is
				219	determined as above, but with the additional restriction that
				220	no junk element appears in the block. Then that block is
				221	extended as far as possible by matching (only) junk elements on
				222	both sides. So the resulting block never matches on junk except
				223	as identical junk happens to be adjacent to an "interesting"
				224	match.
				225
				226	If no blocks match, returns (alo, blo, 0).
				227	"""
				228
				229	# CAUTION: stripping common prefix or suffix would be incorrect.
				230	# E.g.,
				231	# ab
				232	# acab
				233	# Longest matching block is "ab", but if common prefix is
				234	# stripped, it's "a" (tied with "b"). UNIX(tm) diff does so
				235	# strip, so ends up claiming that ab is changed to acab by
				236	# inserting "ca" in the middle. That's minimal but unintuitive:
				237	# "it's obvious" that someone inserted "ac" at the front.
				238	# Windiff ends up at the same place as diff, but by pairing up
				239	# the unique 'b's and then matching the first two 'a's.
				240
				241	# find longest junk-free match
				242	a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
				243	besti, bestj, bestsize = alo, blo, 0
				244	for i in xrange(alo, ahi):
				245	# check for longest match starting at a[i]
				246	if i + bestsize >= ahi:
				247	# we're too far right to get a new best
				248	break
				249	# look at all instances of a[i] in b; note that because
				250	# b2j has no junk keys, the loop is skipped if a[i] is junk
				251	for j in b2j.get(a[i], []):
				252	# a[i] matches b[j]
				253	if j < blo:
				254	continue
				255	if j + bestsize >= bhi:
				256	# we're too far right to get a new best, here or
				257	# anywhere to the right
				258	break
				259	if a[i + bestsize] != b[j + bestsize]:
				260	# can't be longer match; this test is not necessary
				261	# for correctness, but is a huge win for efficiency
				262	continue
				263	# set k to length of match
				264	k = 1 # a[i] == b[j] already known
				265	while i + k < ahi and j + k < bhi and \
				266	a[i+k] == b[j+k] and not isbjunk(b[j+k]):
				267	k = k + 1
				268	if k > bestsize:
				269	besti, bestj, bestsize = i, j, k
				270	if i + bestsize >= ahi:
				271	# only time in my life I really wanted a
				272	# labelled break <wink> -- we're done with
				273	# both loops now
				274	break
				275
				276	# Now that we have a wholly interesting match (albeit possibly
				277	# empty!), we may as well suck up the matching junk on each
				278	# side of it too. Can't think of a good reason not to, and it
				279	# saves post-processing the (possibly considerable) expense of
				280	# figuring out what to do with it. In the case of an empty
				281	# interesting match, this is clearly the right thing to do,
				282	# because no other kind of match is possible in the regions.
				283	while besti > alo and bestj > blo and \
				284	isbjunk(b[bestj-1]) and \
				285	a[besti-1] == b[bestj-1]:
				286	besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
				287	while besti+bestsize < ahi and bestj+bestsize < bhi and \
				288	isbjunk(b[bestj+bestsize]) and \
				289	a[besti+bestsize] == b[bestj+bestsize]:
				290	bestsize = bestsize + 1
				291
				292	if TRACE:
				293	print "get_matching_blocks", alo, ahi, blo, bhi
				294	print " returns", besti, bestj, bestsize
				295	return besti, bestj, bestsize
				296
				297	# A different implementation, using a binary doubling technique that
				298	# does far fewer element compares (trades 'em for integer compares),
				299	# and has n*lg n worst-case behavior. Alas, the code is much harder
				300	# to follow (the details are tricky!), and in most cases I've seen,
				301	# it takes at least 50% longer than the "clever dumb" method above;
				302	# probably due to creating layers of small dicts.
				303	# NOTE: this no longer matches the version above wrt junk; remains
				304	# too unpromising to update it; someday, though ...
				305
				306	# def find_longest_match(self, alo, ahi, blo, bhi):
				307	# """Find longest matching block in a[alo:ahi] and b[blo:bhi].
				308	#
				309	# Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
				310	# alo <= i <= i+k <= ahi
				311	# blo <= j <= j+k <= bhi
				312	# and for all (i',j',k') meeting those conditions,
				313	# k >= k'
				314	# i <= i'
				315	# and if i == i', j <= j'
				316	# In other words, of all maximal matching blocks, returns one
				317	# that starts earliest in a, and of all those maximal matching
				318	# blocks that start earliest in a, returns the one that starts
				319	# earliest in b.
				320	#
				321	# If no blocks match, returns (alo, blo, 0).
				322	# """
				323	#
				324	# a, b2j = self.a, self.b2j
				325	# # alljs[size][i] is a set of all j's s.t. a[i:i+len] matches
				326	# # b[j:j+len]
				327	# alljs = {}
				328	# alljs[1] = js = {}
				329	# ahits = {}
				330	# for i in xrange(alo, ahi):
				331	# elt = a[i]
				332	# if ahits.has_key(elt):
				333	# js[i] = ahits[elt]
				334	# continue
				335	# if b2j.has_key(elt):
				336	# in_range = {}
				337	# for j in b2j[elt]:
				338	# if j >= blo:
				339	# if j >= bhi:
				340	# break
				341	# in_range[j] = 1
				342	# if in_range:
				343	# ahits[elt] = js[i] = in_range
				344	# del ahits
				345	# size = 1
				346	# while js:
				347	# oldsize = size
				348	# size = size + size
				349	# oldjs = js
				350	# alljs[size] = js = {}
				351	# for i in oldjs.keys():
				352	# # i has matches of size oldsize
				353	# if not oldjs.has_key(i + oldsize):
				354	# # can't double it
				355	# continue
				356	# second_js = oldjs[i + oldsize]
				357	# answer = {}
				358	# for j in oldjs[i].keys():
				359	# if second_js.has_key(j + oldsize):
				360	# answer[j] = 1
				361	# if answer:
				362	# js[i] = answer
				363	# del alljs[size]
				364	# size = size >> 1 # max power of 2 with a match
				365	# if not size:
				366	# return alo, blo, 0
				367	# besti, bestj, bestsize = alo, blo, 0
				368	# fatis = alljs[size].keys()
				369	# fatis.sort()
				370	# for i in fatis:
				371	# # figure out longest match starting at a[i]
				372	# totalsize = halfsize = size
				373	# # i has matches of len totalsize at the indices in js
				374	# js = alljs[size][i].keys()
				375	# while halfsize > 1:
				376	# halfsize = halfsize >> 1
				377	# # is there a match of len halfsize starting at
				378	# # i + totalsize?
				379	# newjs = []
				380	# if alljs[halfsize].has_key(i + totalsize):
				381	# second_js = alljs[halfsize][i + totalsize]
				382	# for j in js:
				383	# if second_js.has_key(j + totalsize):
				384	# newjs.append(j)
				385	# if newjs:
				386	# totalsize = totalsize + halfsize
				387	# js = newjs
				388	# if totalsize > bestsize:
				389	# besti, bestj, bestsize = i, min(js), totalsize
				390	# return besti, bestj, bestsize
				391
				392	def get_matching_blocks(self):
				393	if self.matching_blocks is not None:
				394	return self.matching_blocks
				395	self.matching_blocks = []
				396	la, lb = len(self.a), len(self.b)
				397	self.__helper(0, la, 0, lb, self.matching_blocks)
				398	self.matching_blocks.append( (la, lb, 0) )
				399	if TRACE:
				400	print '*** matching blocks', self.matching_blocks
				401	return self.matching_blocks
				402
				403	# builds list of matching blocks covering a[alo:ahi] and
				404	# b[blo:bhi], appending them in increasing order to answer
				405
				406	def __helper(self, alo, ahi, blo, bhi, answer):
				407	i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi)
				408	# a[alo:i] vs b[blo:j] unknown
				409	# a[i:i+k] same as b[j:j+k]
				410	# a[i+k:ahi] vs b[j+k:bhi] unknown
				411	if k:
				412	if alo < i and blo < j:
				413	self.__helper(alo, i, blo, j, answer)
				414	answer.append( x )
				415	if i+k < ahi and j+k < bhi:
				416	self.__helper(i+k, ahi, j+k, bhi, answer)
				417
				418	def ratio(self):
				419	"""Return a measure of the sequences' similarity (float in [0,1]).
				420
				421	Where T is the total number of elements in both sequences, and
				422	M is the number of matches, this is 2*M / T.
				423	Note that this is 1 if the sequences are identical, and 0 if
				424	they have nothing in common.
				425	"""
				426
				427	matches = reduce(lambda sum, triple: sum + triple[-1],
				428	self.get_matching_blocks(), 0)
				429	return 2.0 * matches / (len(self.a) + len(self.b))
				430
				431	def quick_ratio(self):
				432	"""Return an upper bound on ratio() relatively quickly."""
				433	# viewing a and b as multisets, set matches to the cardinality
				434	# of their intersection; this counts the number of matches
				435	# without regard to order, so is clearly an upper bound
				436	if self.fullbcount is None:
				437	self.fullbcount = fullbcount = {}
				438	for elt in self.b:
				439	fullbcount[elt] = fullbcount.get(elt, 0) + 1
				440	fullbcount = self.fullbcount
				441	# avail[x] is the number of times x appears in 'b' less the
				442	# number of times we've seen it in 'a' so far ... kinda
				443	avail = {}
				444	availhas, matches = avail.has_key, 0
				445	for elt in self.a:
				446	if availhas(elt):
				447	numb = avail[elt]
				448	else:
				449	numb = fullbcount.get(elt, 0)
				450	avail[elt] = numb - 1
				451	if numb > 0:
				452	matches = matches + 1
				453	return 2.0 * matches / (len(self.a) + len(self.b))
				454
				455	def real_quick_ratio(self):
				456	"""Return an upper bound on ratio() very quickly"""
				457	la, lb = len(self.a), len(self.b)
				458	# can't have more matches than the number of elements in the
				459	# shorter sequence
				460	return 2.0 * min(la, lb) / (la + lb)
				461
				462	def get_opcodes(self):
				463	if self.opcodes is not None:
				464	return self.opcodes
				465	i = j = 0
				466	self.opcodes = answer = []
				467	for ai, bj, size in self.get_matching_blocks():
				468	# invariant: we've pumped out correct diffs to change
				469	# a[:i] into b[:j], and the next matching block is
				470	# a[ai:ai+size] == b[bj:bj+size]. So we need to pump
				471	# out a diff to change a[i:ai] into b[j:bj], pump out
				472	# the matching block, and move (i,j) beyond the match
				473	tag = ''
				474	if i < ai and j < bj:
				475	tag = 'replace'
				476	elif i < ai:
				477	tag = 'delete'
				478	elif j < bj:
				479	tag = 'insert'
				480	if tag:
				481	answer.append( (tag, i, ai, j, bj) )
				482	i, j = ai+size, bj+size
				483	# the list of matching blocks is terminated by a
				484	# sentinel with size 0
				485	if size:
				486	answer.append( ('equal', ai, i, bj, j) )
				487	return answer
				488
				489	# meant for dumping lines
				490	def dump(tag, x, lo, hi):
				491	for i in xrange(lo, hi):
				492	print tag, x[i],
				493
				494	# figure out which mark to stick under characters in lines that
				495	# have changed (blank = same, - = deleted, + = inserted, ^ = replaced)
				496	_combine = { ' ': ' ',
				497	'. ': '-',
				498	' .': '+',
				499	'..': '^' }
				500
				501	def plain_replace(a, alo, ahi, b, blo, bhi):
				502	assert alo < ahi and blo < bhi
				503	# dump the shorter block first -- reduces the burden on short-term
				504	# memory if the blocks are of very different sizes
				505	if bhi - blo < ahi - alo:
				506	dump('+', b, blo, bhi)
				507	dump('-', a, alo, ahi)
				508	else:
				509	dump('-', a, alo, ahi)
				510	dump('+', b, blo, bhi)
				511
				512	# When replacing one block of lines with another, this guy searches
				513	# the blocks for similar lines; the best-matching pair (if any) is
				514	# used as a synch point, and intraline difference marking is done on
				515	# the similar pair. Lots of work, but often worth it.
				516
				517	def fancy_replace(a, alo, ahi, b, blo, bhi):
				518	if TRACE:
				519	print '*** fancy_replace', alo, ahi, blo, bhi
				520	dump('>', a, alo, ahi)
				521	dump('<', b, blo, bhi)
				522
				523	# don't synch up unless the lines have a similarity score of at
				524	# least cutoff; best_ratio tracks the best score seen so far
				525	best_ratio, cutoff = 0.74, 0.75
				526	cruncher = SequenceMatcher(IS_CHARACTER_JUNK)
				527	eqi, eqj = None, None # 1st indices of equal lines (if any)
				528
				529	# search for the pair that matches best without being identical
				530	# (identical lines must be junk lines, & we don't want to synch up
				531	# on junk -- unless we have to)
				532	for j in xrange(blo, bhi):
				533	bj = b[j]
				534	cruncher.set_seq2(bj)
				535	for i in xrange(alo, ahi):
				536	ai = a[i]
				537	if ai == bj:
				538	if eqi is None:
				539	eqi, eqj = i, j
				540	continue
				541	cruncher.set_seq1(ai)
				542	# computing similarity is expensive, so use the quick
				543	# upper bounds first -- have seen this speed up messy
				544	# compares by a factor of 3.
				545	# note that ratio() is only expensive to compute the first
				546	# time it's called on a sequence pair; the expensive part
				547	# of the computation is cached by cruncher
				548	if cruncher.real_quick_ratio() > best_ratio and \
				549	cruncher.quick_ratio() > best_ratio and \
				550	cruncher.ratio() > best_ratio:
				551	best_ratio, best_i, best_j = cruncher.ratio(), i, j
				552	if best_ratio < cutoff:
				553	# no non-identical "pretty close" pair
				554	if eqi is None:
				555	# no identical pair either -- treat it as a straight replace
				556	plain_replace(a, alo, ahi, b, blo, bhi)
				557	return
				558	# no close pair, but an identical pair -- synch up on that
				559	best_i, best_j, best_ratio = eqi, eqj, 1.0
				560	else:
				561	# there's a close pair, so forget the identical pair (if any)
				562	eqi = None
				563
				564	# a[best_i] very similar to b[best_j]; eqi is None iff they're not
				565	# identical
				566	if TRACE:
				567	print '*** best_ratio', best_ratio, best_i, best_j
				568	dump('>', a, best_i, best_i+1)
				569	dump('<', b, best_j, best_j+1)
				570
				571	# pump out diffs from before the synch point
				572	fancy_helper(a, alo, best_i, b, blo, best_j)
				573
				574	# do intraline marking on the synch pair
				575	aelt, belt = a[best_i], b[best_j]
				576	if eqi is None:
				577	# pump out a '-', '+', '?' triple for the synched lines;
				578	atags = btags = ""
				579	cruncher.set_seqs(aelt, belt)
				580	for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():
				581	la, lb = ai2 - ai1, bj2 - bj1
				582	if tag == 'replace':
				583	atags = atags + '.' * la
				584	btags = btags + '.' * lb
				585	elif tag == 'delete':
				586	atags = atags + '.' * la
				587	elif tag == 'insert':
				588	btags = btags + '.' * lb
				589	elif tag == 'equal':
				590	atags = atags + ' ' * la
				591	btags = btags + ' ' * lb
				592	else:
				593	raise ValueError, 'unknown tag ' + `tag`
				594	la, lb = len(atags), len(btags)
				595	if la < lb:
				596	atags = atags + ' ' * (lb - la)
				597	elif lb < la:
				598	btags = btags + ' ' * (la - lb)
				599	combined = map(lambda x,y: _combine[x+y], atags, btags)
				600	print '-', aelt, '+', belt, '?', \
				601	string.rstrip(string.join(combined, ''))
				602	else:
				603	# the synch pair is identical
				604	print ' ', aelt,
				605
				606	# pump out diffs from after the synch point
				607	fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi)
				608
				609	def fancy_helper(a, alo, ahi, b, blo, bhi):
				610	if alo < ahi:
				611	if blo < bhi:
				612	fancy_replace(a, alo, ahi, b, blo, bhi)
				613	else:
				614	dump('-', a, alo, ahi)
				615	elif blo < bhi:
				616	dump('+', b, blo, bhi)
				617
				618	# open a file & return the file object; gripe and return 0 if it
				619	# couldn't be opened
				620	def fopen(fname):
				621	try:
				622	return open(fname, 'r')
				623	except IOError, detail:
				624	print "couldn't open " + fname + ": " + `detail`
				625	return 0
				626
				627	# open two files & spray the diff to stdout; return false iff a problem
				628	def fcompare(f1name, f2name):
				629	f1 = fopen(f1name)
				630	f2 = fopen(f2name)
				631	if not f1 or not f2:
				632	return 0
				633
				634	a = f1.readlines(); f1.close()
				635	b = f2.readlines(); f2.close()
				636
				637	cruncher = SequenceMatcher(IS_LINE_JUNK, a, b)
				638	for tag, alo, ahi, blo, bhi in cruncher.get_opcodes():
				639	if tag == 'replace':
				640	fancy_replace(a, alo, ahi, b, blo, bhi)
				641	elif tag == 'delete':
				642	dump('-', a, alo, ahi)
				643	elif tag == 'insert':
				644	dump('+', b, blo, bhi)
				645	elif tag == 'equal':
				646	dump(' ', a, alo, ahi)
				647	else:
				648	raise ValueError, 'unknown tag ' + `tag`
				649
				650	return 1
				651
				652	# get file names from argv & compare; return false iff a problem
				653	def main():
				654	from sys import argv
				655	if len(argv) != 3:
				656	print 'need 2 args'
				657	return 0
				658	[f1name, f2name] = argv[1:3]
				659	print '-:', f1name
				660	print '+:', f2name
				661	return fcompare(f1name, f2name)
				662
				663	if __name__ == '__main__':
				664	if 1:
				665	main()
				666	else:
				667	import profile, pstats
				668	statf = "ndiff.pro"
				669	profile.run("main()", statf)
				670	stats = pstats.Stats(statf)
				671	stats.strip_dirs().sort_stats('time').print_stats()
				672