blob: d59fa60d023282510963d5014f2cb9f6fc3cd5aa [file] [log] [blame]
# Module 'cmpcache'
#
# Efficiently compare files, boolean outcome only (equal / not equal).
#
# Tricks (used in this order):
# - Use the statcache module to avoid statting files more than once
# - Files with identical type, size & mtime are assumed to be clones
# - Files with different type or size cannot be identical
# - We keep a cache of outcomes of earlier comparisons
# - We don't fork a process to run 'cmp' but read the files ourselves
Guido van Rossumc6360141990-10-13 19:23:40 +000011
Guido van Rossum25d7caf1992-03-31 19:04:48 +000012import os
Guido van Rossum276123d1990-10-24 16:39:37 +000013from stat import *
Guido van Rossumc6360141990-10-13 19:23:40 +000014import statcache
15
16
# The cache.
#
# Filled in by cmp(): each entry is keyed on the pair of file names and
# holds a tuple (signature of f1, signature of f2, outcome), where the
# signatures are the sig() values seen when the outcome was computed.
#
cache = {}
20
21
# Compare two files, use the cache if possible.
# May raise os.error if a stat or open of either fails.
#
def cmp(f1, f2):
    """Return 1 if files f1 and f2 are considered identical, 0 otherwise.

    f1 and f2 are file names.  The decision is made as cheaply as
    possible: by stat signature first, then from the cache of earlier
    outcomes, and only as a last resort by reading both files.
    Exceptions raised while statting or reading either file propagate
    to the caller.
    """
    s1, s2 = sig(statcache.stat(f1)), sig(statcache.stat(f2))
    if not S_ISREG(s1[0]) or not S_ISREG(s2[0]):
        # Either is not a plain file -- always report as different
        return 0
    if s1 == s2:
        # type, size & mtime match -- report same
        return 1
    if s1[:2] != s2[:2]:
        # types or sizes differ -- report different
        return 0
    # same type and size -- look in the cache.
    # The key is a tuple, not the two names joined by a space: a joined
    # string cannot tell ('a b', 'c') from ('a', 'b c'), so two distinct
    # pairs could end up sharing one cache entry.
    key = (f1, f2)
    entry = cache.get(key)
    if entry is not None:
        # cache hit
        cs1, cs2, outcome = entry
        if s1 == cs1 and s2 == cs2:
            # cached signatures match
            return outcome
        # stale cached signature(s)
    # really compare
    outcome = do_cmp(f1, f2)
    cache[key] = s1, s2, outcome
    return outcome
51
# Build a file signature from raw stat data: the tuple
# (file type bits, size, modification time).
#
def sig(st):
    # Keep only the file-type bits of the mode; permission bits are
    # not part of the signature.
    file_type = S_IFMT(st[ST_MODE])
    return (file_type, st[ST_SIZE], st[ST_MTIME])
Guido van Rossumc6360141990-10-13 19:23:40 +000056
# Compare two files, really.
#
def do_cmp(f1, f2):
    """Compare the contents of the files named f1 and f2.

    Return 1 if both files hold exactly the same bytes, 0 otherwise.
    Errors from opening or reading either file propagate to the
    caller; both files are closed before returning or raising.
    """
    bufsize = 8096  # Could be tuned
    # Binary mode: compare the raw bytes, so that no newline translation
    # or text decoding can hide (or invent) a difference.
    fp1 = open(f1, 'rb')
    try:
        fp2 = open(f2, 'rb')
        try:
            while 1:
                b1 = fp1.read(bufsize)
                b2 = fp2.read(bufsize)
                if b1 != b2:
                    return 0
                if not b1:
                    # Both files hit end-of-file together
                    return 1
        finally:
            fp2.close()
    finally:
        fp1.close()
68 if not b1: return 1