Guido van Rossum | 4acc25b | 2000-02-02 15:10:15 +0000 | [diff] [blame] | 1 | """Efficiently compare files, boolean outcome only (equal / not equal). |
| 2 | |
| 3 | Tricks (used in this order): |
| 4 | - Use the statcache module to avoid statting files more than once |
| 5 | - Files with identical type, size & mtime are assumed to be clones |
| 6 | - Files with different type or size cannot be identical |
| 7 | - We keep a cache of outcomes of earlier comparisons |
| 8 | - We don't fork a process to run 'cmp' but read the files ourselves |
| 9 | """ |
Guido van Rossum | c636014 | 1990-10-13 19:23:40 +0000 | [diff] [blame] | 10 | |
Guido van Rossum | 25d7caf | 1992-03-31 19:04:48 +0000 | [diff] [blame] | 11 | import os |
Guido van Rossum | 276123d | 1990-10-24 16:39:37 +0000 | [diff] [blame] | 12 | from stat import * |
Guido van Rossum | c636014 | 1990-10-13 19:23:40 +0000 | [diff] [blame] | 13 | import statcache |
| 14 | |
| 15 | |
# The cache.
#
# Module-level dictionary filled in by cmp().  Each entry is keyed on the
# pair of file names that was compared and holds a 3-tuple
# (signature of first file, signature of second file, outcome), so that a
# later call can reuse the outcome as long as both signatures still match.
cache = {}
| 19 | |
| 20 | |
def cmp(f1, f2, shallow=1):
    """Compare two files, use the cache if possible.

    Arguments:
    f1, f2  -- names of the two files to compare
    shallow -- if true (the default), two regular files whose signatures
               (type, size and mtime) are identical are reported as
               equal without their contents being read

    Return 1 for identical files, 0 for different.  If either name does
    not refer to a regular file the result is always 0.

    May raise os.error if a stat or open of either file fails; errors
    from reading the files are not caught here either.
    """
    s1, s2 = sig(statcache.stat(f1)), sig(statcache.stat(f2))
    if not S_ISREG(s1[0]) or not S_ISREG(s2[0]):
        # Either is not a plain file -- always report as different
        return 0
    if shallow and s1 == s2:
        # type, size & mtime match -- report same
        return 1
    if s1[:2] != s2[:2]:
        # types or sizes differ -- report different
        return 0
    # same type and size -- look in the cache
    # A tuple key keeps the two names apart; joining them with a space
    # made distinct pairs such as ('a b', 'c') and ('a', 'b c') collide.
    key = (f1, f2)
    # dict.get() is used instead of has_key(), which newer Pythons lack.
    cached = cache.get(key)
    if cached is not None:
        # cache hit
        cs1, cs2, outcome = cached
        if s1 == cs1 and s2 == cs2:
            # cached signatures match
            return outcome
        # stale cached signature(s) -- fall through and recompute
    # really compare
    outcome = do_cmp(f1, f2)
    cache[key] = s1, s2, outcome
    return outcome
Guido van Rossum | c636014 | 1990-10-13 19:23:40 +0000 | [diff] [blame] | 49 | |
def sig(st):
    """Build the comparison signature for one file from its stat data.

    st is indexed with the ST_* constants from the stat module.  The
    result is the 3-tuple (file type bits, size, modification time).
    """
    file_type = S_IFMT(st[ST_MODE])
    size = st[ST_SIZE]
    mtime = st[ST_MTIME]
    return file_type, size, mtime
Guido van Rossum | c636014 | 1990-10-13 19:23:40 +0000 | [diff] [blame] | 53 | |
def do_cmp(f1, f2):
    """Compare two files, really.

    Both files are opened in binary mode and read in parallel, one
    buffer at a time.  Return 1 if every buffer matches up to end of
    file, 0 as soon as a pair of buffers differs.

    Errors from opening or reading either file propagate to the caller.
    Both files are closed before returning, also when an error occurs.
    """
    bufsize = 8*1024  # Could be tuned
    fp1 = open(f1, 'rb')
    try:
        fp2 = open(f2, 'rb')
        try:
            while 1:
                b1 = fp1.read(bufsize)
                b2 = fp2.read(bufsize)
                if b1 != b2:
                    # Covers differing data and one file ending early
                    return 0
                if not b1:
                    # Both files hit end of file at the same time
                    return 1
        finally:
            fp2.close()
    finally:
        # Also reached when opening f2 fails, so fp1 never leaks
        fp1.close()