#! /usr/bin/env python

# Released to the public domain $JustDate: 3/16/98 $,
# by Tim Peters (email tim_one@email.msn.com).

# ndiff file1 file2 -- a human-friendly file differencer.

# $Revision$

# SequenceMatcher tries to compute a "human-friendly diff" between
# two sequences (chiefly picturing a file as a sequence of lines,
# and a line as a sequence of characters, here).  Unlike UNIX(tm) diff,
# e.g., the fundamental notion is the longest *contiguous* & junk-free
# matching subsequence.  That's what catches people's eyes.  The
# Windows(tm) windiff has another interesting notion, pairing up elements
# that appear uniquely in each sequence.  That, and the method here,
# appear to yield more intuitive difference reports than does diff.  This
# method appears to be the least vulnerable to synching up on blocks
# of "junk lines", though (like blank lines in ordinary text files,
# or maybe "<P>" lines in HTML files).  That may be because this is
# the only method of the 3 that has a *concept* of "junk" <wink>.
#
# Note that ndiff makes no claim to produce a *minimal* diff.  To the
# contrary, minimal diffs are often counter-intuitive, because they
# synch up anywhere possible, sometimes on accidental matches 100 pages
# apart.  Restricting synch points to contiguous matches preserves some
# notion of locality, at the occasional cost of producing a longer diff.
#
# With respect to junk, an earlier version of ndiff simply refused to
# *start* a match with a junk element.  The result was cases like this:
#     before: private Thread currentThread;
#     after:  private volatile Thread currentThread;
# If you consider whitespace to be junk, the longest contiguous match
# not starting with junk is "e Thread currentThread".  So ndiff reported
# that "e volatil" was inserted between the 't' and the 'e' in "private".
# While an accurate view, to people that's absurd.  The current version
# looks for matching blocks that are entirely junk-free, then extends the
# longest one of those as far as possible but only with matching junk.
# So now "currentThread" is matched, then extended to suck up the
# preceding blank; then "private" is matched, and extended to suck up the
# following blank; then "Thread" is matched; and finally ndiff reports
# that "volatile " was inserted before "Thread".  The only quibble
# remaining is that perhaps it was really the case that " volatile"
# was inserted after "private".  I can live with that <wink>.
#
# NOTE on the output:  From an ndiff report,
# 1) The first file can be recovered by retaining only lines that begin
#    with "  " or "- ", and deleting those 2-character prefixes.
# 2) The second file can be recovered similarly, but by retaining only
#    "  " and "+ " lines.
# 3) Lines beginning with "? " attempt to guide the eye to intraline
#    differences, and were not present in either input file.
#
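# For example, an illustrative report for the "private Thread" case
# above (spacing approximate) looks like:
#     - private Thread currentThread;
#     + private volatile Thread currentThread;
#     ?         +++++++++
#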
# COROLLARY:
# On Unix, the second file can be recovered by piping the output through
#     sed -n '/^[+ ] /s/^..//p'
# Modifications to recover the first file are left as an exercise for
# the reader.
#
# NOTE on junk:  the module-level names
#     IS_LINE_JUNK
#     IS_CHARACTER_JUNK
# can be set to any functions you like.  The first one should accept
# a single string argument, and return true iff the string is junk.
# The default is whether the regexp r"\s*#?\s*$" matches (i.e., a
# line without visible characters, except for at most one splat).
# The second should accept a single character (a string of length 1),
# again returning true iff it is junk.  The default is whether the
# character is a blank or tab (note: bad idea to include newline in
# this!).
#
# After setting those, you can call fcompare(f1name, f2name) with the
# names of the files you want to compare.  The difference report
# is sent to stdout.  Or you can call main(), which expects to find
# (exactly) the two file names in sys.argv.
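#
# For example, a hypothetical session (the module and file names here
# are made up):
#     import ndiff
#     ndiff.IS_LINE_JUNK = lambda line: line == "\n"   # blank lines only
#     ndiff.fcompare("old.txt", "new.txt")             # report to stdout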

import string
TRACE = 0

# define what "junk" means
import re

def IS_LINE_JUNK(line, pat=re.compile(r"\s*#?\s*$").match):
    return pat(line) is not None

def IS_CHARACTER_JUNK(ch, ws=" \t"):
    return ch in ws
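
# Illustrative values, assuming the defaults above:
#     IS_LINE_JUNK("  #  \n")   -> true  (blanks plus at most one splat)
#     IS_LINE_JUNK("code()\n")  -> false
#     IS_CHARACTER_JUNK(" ")    -> true; so is "\t", but "\n" is not junk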

del re

class SequenceMatcher:
    def __init__(self, isjunk=None, a='', b=''):
        # Members:
        # a
        #      first sequence
        # b
        #      second sequence; differences are computed as "what do
        #      we need to do to 'a' to change it into 'b'?"
        # b2j
        #      for x in b, b2j[x] is a list of the indices (into b)
        #      at which x appears; junk elements do not appear
        # b2jhas
        #      b2j.has_key
        # fullbcount
        #      for x in b, fullbcount[x] == the number of times x
        #      appears in b; only materialized if really needed (used
        #      only for computing quick_ratio())
        # matching_blocks
        #      a list of (i, j, k) triples, where a[i:i+k] == b[j:j+k];
        #      ascending & non-overlapping in i and in j; terminated by
        #      a dummy (len(a), len(b), 0) sentinel
        # opcodes
        #      a list of (tag, i1, i2, j1, j2) tuples, where tag is
        #      one of
        #          'replace'  a[i1:i2] should be replaced by b[j1:j2]
        #          'delete'   a[i1:i2] should be deleted
        #          'insert'   b[j1:j2] should be inserted
        #          'equal'    a[i1:i2] == b[j1:j2]
        # isjunk
        #      a user-supplied function taking a sequence element and
        #      returning true iff the element is "junk" -- this has
        #      subtle but helpful effects on the algorithm, which I'll
        #      get around to writing up someday <0.9 wink>.
        #      DON'T USE!  Only __chain_b uses this.  Use isbjunk.
        # isbjunk
        #      for x in b, isbjunk(x) == isjunk(x) but much faster;
        #      it's really the has_key method of a hidden dict.
        #      DOES NOT WORK for x in a!
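        #
        # Illustrative example of matching_blocks and opcodes, with
        # isjunk None: for a = "qabxcd", b = "abycdf",
        #     matching_blocks == [(1, 0, 2), (4, 3, 2), (6, 6, 0)]
        #     opcodes == [('delete', 0, 1, 0, 0), ('equal', 1, 3, 0, 2),
        #                 ('replace', 3, 4, 2, 3), ('equal', 4, 6, 3, 5),
        #                 ('insert', 6, 6, 5, 6)]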

        self.isjunk = isjunk
        self.a = self.b = None
        self.set_seqs(a, b)

    def set_seqs(self, a, b):
        self.set_seq1(a)
        self.set_seq2(b)

    def set_seq1(self, a):
        if a is self.a:
            return
        self.a = a
        self.matching_blocks = self.opcodes = None

    def set_seq2(self, b):
        if b is self.b:
            return
        self.b = b
        self.matching_blocks = self.opcodes = None
        self.fullbcount = None
        self.__chain_b()

    # for each element x in b, set b2j[x] to a list of the indices in
    # b where x appears; the indices are in increasing order; note that
    # the number of times x appears in b is len(b2j[x]) ...
    # when self.isjunk is defined, junk elements don't show up in this
    # map at all, which stops the central find_longest_match method
    # from starting any matching block at a junk element ...
    # also creates the fast isbjunk function ...
    # note that this is only called when b changes; so for cross-product
    # kinds of matches, it's best to call set_seq2 once, then set_seq1
    # repeatedly
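    #
    # Illustrative example: if b = "abcab" and blanks are the only junk,
    # b2j ends up as {'a': [0, 3], 'b': [1, 4], 'c': [2]}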

    def __chain_b(self):
        # Because isjunk is a user-defined (not C) function, and we test
        # for junk a LOT, it's important to minimize the number of calls.
        # Before the tricks described here, __chain_b was by far the most
        # time-consuming routine in the whole module!  If anyone sees
        # Jim Roskind, thank him again for profile.py -- I never would
        # have guessed that.
        # The first trick is to build b2j ignoring the possibility
        # of junk.  I.e., we don't call isjunk at all yet.  Throwing
        # out the junk later is much cheaper than building b2j "right"
        # from the start.
        b = self.b
        self.b2j = b2j = {}
        self.b2jhas = b2jhas = b2j.has_key
        for i in xrange(0, len(b)):
            elt = b[i]
            if b2jhas(elt):
                b2j[elt].append(i)
            else:
                b2j[elt] = [i]

        # Now b2j.keys() contains elements uniquely, and especially when
        # the sequence is a string, that's usually a good deal smaller
        # than len(string).  The difference is the number of isjunk calls
        # saved.
        isjunk, junkdict = self.isjunk, {}
        if isjunk:
            for elt in b2j.keys():
                if isjunk(elt):
                    junkdict[elt] = 1   # value irrelevant; it's a set
                    del b2j[elt]

        # Now for x in b, isjunk(x) == junkdict.has_key(x), but the
        # latter is much faster.  Note too that while there may be a
        # lot of junk in the sequence, the number of *unique* junk
        # elements is probably small.  So the memory burden of keeping
        # this dict alive is likely trivial compared to the size of b2j.
        self.isbjunk = junkdict.has_key

    def find_longest_match(self, alo, ahi, blo, bhi):
        """Find longest matching block in a[alo:ahi] and b[blo:bhi].

        If isjunk is not defined:

        Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
            alo <= i <= i+k <= ahi
            blo <= j <= j+k <= bhi
        and for all (i',j',k') meeting those conditions,
            k >= k'
            i <= i'
            and if i == i', j <= j'
        In other words, of all maximal matching blocks, returns one
        that starts earliest in a, and of all those maximal matching
        blocks that start earliest in a, returns the one that starts
        earliest in b.

        If isjunk is defined, first the longest matching block is
        determined as above, but with the additional restriction that
        no junk element appears in the block.  Then that block is
        extended as far as possible by matching (only) junk elements on
        both sides.  So the resulting block never matches on junk except
        as identical junk happens to be adjacent to an "interesting"
        match.

        If no blocks match, returns (alo, blo, 0).
        """

        # CAUTION: stripping common prefix or suffix would be incorrect.
        # E.g.,
        #     ab
        #     acab
        # Longest matching block is "ab", but if common prefix is
        # stripped, it's "a" (tied with "b").  UNIX(tm) diff does so
        # strip, so ends up claiming that ab is changed to acab by
        # inserting "ca" in the middle.  That's minimal but unintuitive:
        # "it's obvious" that someone inserted "ac" at the front.
        # Windiff ends up at the same place as diff, but by pairing up
        # the unique 'b's and then matching the first two 'a's.
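        # Illustrative example: for a = " abcd" and b = "abcd abcd" with
        # no junk, the longest match is a[0:5] == b[4:9] == " abcd", so
        # (0, 4, 5) is returned.  If blanks are junk, matching blocks
        # can't start at a blank, so the junk-free "abcd" at the front
        # of b wins and (1, 0, 4) is returned instead.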

        # find longest junk-free match
        a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
        besti, bestj, bestsize = alo, blo, 0
        for i in xrange(alo, ahi):
            # check for longest match starting at a[i]
            if i + bestsize >= ahi:
                # we're too far right to get a new best
                break
            # look at all instances of a[i] in b; note that because
            # b2j has no junk keys, the loop is skipped if a[i] is junk
            for j in b2j.get(a[i], []):
                # a[i] matches b[j]
                if j < blo:
                    continue
                if j + bestsize >= bhi:
                    # we're too far right to get a new best, here or
                    # anywhere to the right
                    break
                if a[i + bestsize] != b[j + bestsize]:
                    # can't be longer match; this test is not necessary
                    # for correctness, but is a huge win for efficiency
                    continue
                # set k to length of match
                k = 1   # a[i] == b[j] already known
                while i + k < ahi and j + k < bhi and \
                      a[i+k] == b[j+k] and not isbjunk(b[j+k]):
                    k = k + 1
                if k > bestsize:
                    besti, bestj, bestsize = i, j, k
                    if i + bestsize >= ahi:
                        # only time in my life I really wanted a
                        # labelled break <wink> -- we're done with
                        # both loops now
                        break

        # Now that we have a wholly interesting match (albeit possibly
        # empty!), we may as well suck up the matching junk on each
        # side of it too.  Can't think of a good reason not to, and it
        # saves post-processing the (possibly considerable) expense of
        # figuring out what to do with it.  In the case of an empty
        # interesting match, this is clearly the right thing to do,
        # because no other kind of match is possible in the regions.
        while besti > alo and bestj > blo and \
              isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize = bestsize + 1

        if TRACE:
            print "find_longest_match", alo, ahi, blo, bhi
            print "    returns", besti, bestj, bestsize
        return besti, bestj, bestsize
296
297# A different implementation, using a binary doubling technique that
298# does far fewer element compares (trades 'em for integer compares),
299# and has n*lg n worst-case behavior. Alas, the code is much harder
300# to follow (the details are tricky!), and in most cases I've seen,
301# it takes at least 50% longer than the "clever dumb" method above;
302# probably due to creating layers of small dicts.
303# NOTE: this no longer matches the version above wrt junk; remains
304# too unpromising to update it; someday, though ...
305
#    def find_longest_match(self, alo, ahi, blo, bhi):
#        """Find longest matching block in a[alo:ahi] and b[blo:bhi].
#
#        Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
#            alo <= i <= i+k <= ahi
#            blo <= j <= j+k <= bhi
#        and for all (i',j',k') meeting those conditions,
#            k >= k'
#            i <= i'
#            and if i == i', j <= j'
#        In other words, of all maximal matching blocks, returns one
#        that starts earliest in a, and of all those maximal matching
#        blocks that start earliest in a, returns the one that starts
#        earliest in b.
#
#        If no blocks match, returns (alo, blo, 0).
#        """
#
#        a, b2j = self.a, self.b2j
#        # alljs[size][i] is a set of all j's s.t. a[i:i+len] matches
#        # b[j:j+len]
#        alljs = {}
#        alljs[1] = js = {}
#        ahits = {}
#        for i in xrange(alo, ahi):
#            elt = a[i]
#            if ahits.has_key(elt):
#                js[i] = ahits[elt]
#                continue
#            if b2j.has_key(elt):
#                in_range = {}
#                for j in b2j[elt]:
#                    if j >= blo:
#                        if j >= bhi:
#                            break
#                        in_range[j] = 1
#                if in_range:
#                    ahits[elt] = js[i] = in_range
#        del ahits
#        size = 1
#        while js:
#            oldsize = size
#            size = size + size
#            oldjs = js
#            alljs[size] = js = {}
#            for i in oldjs.keys():
#                # i has matches of size oldsize
#                if not oldjs.has_key(i + oldsize):
#                    # can't double it
#                    continue
#                second_js = oldjs[i + oldsize]
#                answer = {}
#                for j in oldjs[i].keys():
#                    if second_js.has_key(j + oldsize):
#                        answer[j] = 1
#                if answer:
#                    js[i] = answer
#        del alljs[size]
#        size = size >> 1    # max power of 2 with a match
#        if not size:
#            return alo, blo, 0
#        besti, bestj, bestsize = alo, blo, 0
#        fatis = alljs[size].keys()
#        fatis.sort()
#        for i in fatis:
#            # figure out longest match starting at a[i]
#            totalsize = halfsize = size
#            # i has matches of len totalsize at the indices in js
#            js = alljs[size][i].keys()
#            while halfsize > 1:
#                halfsize = halfsize >> 1
#                # is there a match of len halfsize starting at
#                # i + totalsize?
#                newjs = []
#                if alljs[halfsize].has_key(i + totalsize):
#                    second_js = alljs[halfsize][i + totalsize]
#                    for j in js:
#                        if second_js.has_key(j + totalsize):
#                            newjs.append(j)
#                    if newjs:
#                        totalsize = totalsize + halfsize
#                        js = newjs
#            if totalsize > bestsize:
#                besti, bestj, bestsize = i, min(js), totalsize
#        return besti, bestj, bestsize

    def get_matching_blocks(self):
        if self.matching_blocks is not None:
            return self.matching_blocks
        self.matching_blocks = []
        la, lb = len(self.a), len(self.b)
        self.__helper(0, la, 0, lb, self.matching_blocks)
        self.matching_blocks.append( (la, lb, 0) )
        if TRACE:
            print '*** matching blocks', self.matching_blocks
        return self.matching_blocks

    # builds list of matching blocks covering a[alo:ahi] and
    # b[blo:bhi], appending them in increasing order to answer

    def __helper(self, alo, ahi, blo, bhi, answer):
        i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi)
        # a[alo:i] vs b[blo:j] unknown
        # a[i:i+k] same as b[j:j+k]
        # a[i+k:ahi] vs b[j+k:bhi] unknown
        if k:
            if alo < i and blo < j:
                self.__helper(alo, i, blo, j, answer)
            answer.append( x )
            if i+k < ahi and j+k < bhi:
                self.__helper(i+k, ahi, j+k, bhi, answer)

    def ratio(self):
        """Return a measure of the sequences' similarity (float in [0,1]).

        Where T is the total number of elements in both sequences, and
        M is the number of matches, this is 2*M / T.
        Note that this is 1 if the sequences are identical, and 0 if
        they have nothing in common.
        """
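        # Illustrative example:
        #     SequenceMatcher(None, "abcd", "bcde").ratio() == 0.75
        # since M is 3 (the block "bcd") and T is 8: 2*3/8 == 0.75.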

        matches = reduce(lambda sum, triple: sum + triple[-1],
                         self.get_matching_blocks(), 0)
        return 2.0 * matches / (len(self.a) + len(self.b))

    def quick_ratio(self):
        """Return an upper bound on ratio() relatively quickly."""
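        # Illustrative example: for "abcd" vs "cdba", quick_ratio() is
        # 1.0 (the multisets are equal), while ratio() is only 0.5
        # (the longest block is "cd").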
        # viewing a and b as multisets, set matches to the cardinality
        # of their intersection; this counts the number of matches
        # without regard to order, so is clearly an upper bound
        if self.fullbcount is None:
            self.fullbcount = fullbcount = {}
            for elt in self.b:
                fullbcount[elt] = fullbcount.get(elt, 0) + 1
        fullbcount = self.fullbcount
        # avail[x] is the number of times x appears in 'b' less the
        # number of times we've seen it in 'a' so far ... kinda
        avail = {}
        availhas, matches = avail.has_key, 0
        for elt in self.a:
            if availhas(elt):
                numb = avail[elt]
            else:
                numb = fullbcount.get(elt, 0)
            avail[elt] = numb - 1
            if numb > 0:
                matches = matches + 1
        return 2.0 * matches / (len(self.a) + len(self.b))

    def real_quick_ratio(self):
        """Return an upper bound on ratio() very quickly."""
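        # Illustrative example: for sequences of lengths 4 and 8 this
        # returns 2.0*4/12, about 0.67, regardless of their contents.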
        la, lb = len(self.a), len(self.b)
        # can't have more matches than the number of elements in the
        # shorter sequence
        return 2.0 * min(la, lb) / (la + lb)

    def get_opcodes(self):
        if self.opcodes is not None:
            return self.opcodes
        i = j = 0
        self.opcodes = answer = []
        for ai, bj, size in self.get_matching_blocks():
            # invariant: we've pumped out correct diffs to change
            # a[:i] into b[:j], and the next matching block is
            # a[ai:ai+size] == b[bj:bj+size].  So we need to pump
            # out a diff to change a[i:ai] into b[j:bj], pump out
            # the matching block, and move (i,j) beyond the match
            tag = ''
            if i < ai and j < bj:
                tag = 'replace'
            elif i < ai:
                tag = 'delete'
            elif j < bj:
                tag = 'insert'
            if tag:
                answer.append( (tag, i, ai, j, bj) )
            i, j = ai+size, bj+size
            # the list of matching blocks is terminated by a
            # sentinel with size 0
            if size:
                answer.append( ('equal', ai, i, bj, j) )
        return answer

# meant for dumping lines
def dump(tag, x, lo, hi):
    for i in xrange(lo, hi):
        print tag, x[i],

# figure out which mark to stick under characters in lines that
# have changed (blank = same, - = deleted, + = inserted, ^ = replaced)
_combine = { '  ': ' ',
             '. ': '-',
             ' .': '+',
             '..': '^' }

def plain_replace(a, alo, ahi, b, blo, bhi):
    assert alo < ahi and blo < bhi
    # dump the shorter block first -- reduces the burden on short-term
    # memory if the blocks are of very different sizes
    if bhi - blo < ahi - alo:
        dump('+', b, blo, bhi)
        dump('-', a, alo, ahi)
    else:
        dump('-', a, alo, ahi)
        dump('+', b, blo, bhi)

# When replacing one block of lines with another, this guy searches
# the blocks for *similar* lines; the best-matching pair (if any) is
# used as a synch point, and intraline difference marking is done on
# the similar pair.  Lots of work, but often worth it.
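#
# Illustrative example: replacing the line "abcDefghiJkl" with
# "abcdefGhijkl" synchs on that pair (similarity above the 0.75
# cutoff) and marks the changed characters, roughly:
#     - abcDefghiJkl
#     + abcdefGhijkl
#     ?    ^  ^  ^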

def fancy_replace(a, alo, ahi, b, blo, bhi):
    if TRACE:
        print '*** fancy_replace', alo, ahi, blo, bhi
        dump('>', a, alo, ahi)
        dump('<', b, blo, bhi)

    # don't synch up unless the lines have a similarity score of at
    # least cutoff; best_ratio tracks the best score seen so far
    best_ratio, cutoff = 0.74, 0.75
    cruncher = SequenceMatcher(IS_CHARACTER_JUNK)
    eqi, eqj = None, None   # 1st indices of equal lines (if any)

    # search for the pair that matches best without being identical
    # (identical lines must be junk lines, & we don't want to synch up
    # on junk -- unless we have to)
    for j in xrange(blo, bhi):
        bj = b[j]
        cruncher.set_seq2(bj)
        for i in xrange(alo, ahi):
            ai = a[i]
            if ai == bj:
                if eqi is None:
                    eqi, eqj = i, j
                continue
            cruncher.set_seq1(ai)
            # computing similarity is expensive, so use the quick
            # upper bounds first -- have seen this speed up messy
            # compares by a factor of 3.
            # note that ratio() is only expensive to compute the first
            # time it's called on a sequence pair; the expensive part
            # of the computation is cached by cruncher
            if cruncher.real_quick_ratio() > best_ratio and \
                  cruncher.quick_ratio() > best_ratio and \
                  cruncher.ratio() > best_ratio:
                best_ratio, best_i, best_j = cruncher.ratio(), i, j
    if best_ratio < cutoff:
        # no non-identical "pretty close" pair
        if eqi is None:
            # no identical pair either -- treat it as a straight replace
            plain_replace(a, alo, ahi, b, blo, bhi)
            return
        # no close pair, but an identical pair -- synch up on that
        best_i, best_j, best_ratio = eqi, eqj, 1.0
    else:
        # there's a close pair, so forget the identical pair (if any)
        eqi = None

    # a[best_i] very similar to b[best_j]; eqi is None iff they're not
    # identical
    if TRACE:
        print '*** best_ratio', best_ratio, best_i, best_j
        dump('>', a, best_i, best_i+1)
        dump('<', b, best_j, best_j+1)

    # pump out diffs from before the synch point
    fancy_helper(a, alo, best_i, b, blo, best_j)

    # do intraline marking on the synch pair
    aelt, belt = a[best_i], b[best_j]
    if eqi is None:
        # pump out a '-', '+', '?' triple for the synched lines;
        atags = btags = ""
        cruncher.set_seqs(aelt, belt)
        for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():
            la, lb = ai2 - ai1, bj2 - bj1
            if tag == 'replace':
                atags = atags + '.' * la
                btags = btags + '.' * lb
            elif tag == 'delete':
                atags = atags + '.' * la
            elif tag == 'insert':
                btags = btags + '.' * lb
            elif tag == 'equal':
                atags = atags + ' ' * la
                btags = btags + ' ' * lb
            else:
                raise ValueError, 'unknown tag ' + `tag`
        la, lb = len(atags), len(btags)
        if la < lb:
            atags = atags + ' ' * (lb - la)
        elif lb < la:
            btags = btags + ' ' * (la - lb)
        combined = map(lambda x,y: _combine[x+y], atags, btags)
        print '-', aelt, '+', belt, '?', \
              string.rstrip(string.join(combined, ''))
    else:
        # the synch pair is identical
        print ' ', aelt,

    # pump out diffs from after the synch point
    fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi)

def fancy_helper(a, alo, ahi, b, blo, bhi):
    if alo < ahi:
        if blo < bhi:
            fancy_replace(a, alo, ahi, b, blo, bhi)
        else:
            dump('-', a, alo, ahi)
    elif blo < bhi:
        dump('+', b, blo, bhi)

# open a file & return the file object; gripe and return 0 if it
# couldn't be opened
def fopen(fname):
    try:
        return open(fname, 'r')
    except IOError, detail:
        print "couldn't open " + fname + ": " + `detail`
        return 0

# open two files & spray the diff to stdout; return false iff a problem
def fcompare(f1name, f2name):
    f1 = fopen(f1name)
    f2 = fopen(f2name)
    if not f1 or not f2:
        return 0

    a = f1.readlines(); f1.close()
    b = f2.readlines(); f2.close()

    cruncher = SequenceMatcher(IS_LINE_JUNK, a, b)
    for tag, alo, ahi, blo, bhi in cruncher.get_opcodes():
        if tag == 'replace':
            fancy_replace(a, alo, ahi, b, blo, bhi)
        elif tag == 'delete':
            dump('-', a, alo, ahi)
        elif tag == 'insert':
            dump('+', b, blo, bhi)
        elif tag == 'equal':
            dump(' ', a, alo, ahi)
        else:
            raise ValueError, 'unknown tag ' + `tag`

    return 1

# get file names from argv & compare; return false iff a problem
def main():
    from sys import argv
    if len(argv) != 3:
        print 'need 2 args'
        return 0
    [f1name, f2name] = argv[1:3]
    print '-:', f1name
    print '+:', f2name
    return fcompare(f1name, f2name)

if __name__ == '__main__':
    if 1:
        main()
    else:
        import profile, pstats
        statf = "ndiff.pro"
        profile.run("main()", statf)
        stats = pstats.Stats(statf)
        stats.strip_dirs().sort_stats('time').print_stats()