blob: 03c2ea3f5b1bcddb377adb6216ae3224d101c504 [file] [log] [blame]
"""Utilities for comparing files and directories.

Classes:
    dircmp

Functions:
    cmp(f1, f2, shallow=1, use_statcache=0) -> int
    cmpfiles(a, b, common, shallow=1, use_statcache=0) -> ([], [], [])

"""
11
12import os
13import stat
14import statcache
Guido van Rossum2d726871999-10-26 14:02:01 +000015
# Names exported by a star-import of this module.
__all__ = [
    "cmp",
    "dircmp",
    "cmpfiles",
]

# Results of earlier content comparisons made by cmp(), keyed by the
# (f1, f2) pair; each value is (signature1, signature2, outcome).
_cache = {}

# Number of bytes read from each file per step when comparing contents.
BUFSIZE = 8 * 1024
20
def cmp(f1, f2, shallow=1, use_statcache=0):
    """Tell whether two files are the same.

    Arguments:

    f1 -- name of the first file

    f2 -- name of the second file

    shallow -- when true (the default), two regular files whose stat
               signatures (type, size, mtime) are equal are reported as
               the same without reading them.

    use_statcache -- when true, obtain stat results from
                     statcache.stat() instead of os.stat().

    Return value:

    True if the files are the same, False otherwise.  Anything that is
    not a regular file is never reported as the same.

    The outcome of each content comparison is remembered per (f1, f2)
    pair together with both signatures, and is reused only while both
    signatures are unchanged.  With 'use_statcache' true the signatures
    come from statcache, so that staleness check is defeated.

    """
    stat_function = os.stat
    if use_statcache:
        stat_function = statcache.stat
    sig1 = _sig(stat_function(f1))
    sig2 = _sig(stat_function(f2))

    # Only two regular files can ever be "the same".
    regular = stat.S_IFREG
    if not (sig1[0] == regular and sig2[0] == regular):
        return False
    # Shallow mode: trust identical signatures.
    if shallow and sig1 == sig2:
        return True
    # Files of different sizes cannot have equal contents.
    if sig1[1] != sig2[1]:
        return False

    key = (f1, f2)
    remembered = _cache.get(key)
    if remembered and remembered[:2] == (sig1, sig2):
        return remembered[2]
    outcome = _do_cmp(f1, f2)
    _cache[key] = sig1, sig2, outcome
    return outcome
Guido van Rossum2d726871999-10-26 14:02:01 +000065
def _sig(st):
    """Return the (file type, size, mtime) signature of a stat result."""
    file_type = stat.S_IFMT(st.st_mode)
    return (file_type, st.st_size, st.st_mtime)
Guido van Rossum2d726871999-10-26 14:02:01 +000070
def _do_cmp(f1, f2, bufsize=None):
    """Compare the contents of two files chunk by chunk.

    f1, f2 -- names of the files to read (opened in binary mode)
    bufsize -- number of bytes to read from each file per step;
               None (the default) means the module-level BUFSIZE,
               looked up at call time.

    Returns True if both files yield identical data up to end of file,
    False as soon as a pair of chunks differs.  Both files are closed
    before returning, including when a read raises.

    """
    if bufsize is None:
        bufsize = BUFSIZE
    fp1 = open(f1, 'rb')
    try:
        fp2 = open(f2, 'rb')
        try:
            while 1:
                b1 = fp1.read(bufsize)
                b2 = fp2.read(bufsize)
                if b1 != b2:
                    return False
                # Equal and empty chunks: both files ended together.
                if not b1:
                    return True
        finally:
            fp2.close()
    finally:
        fp1.close()
82
# Directory comparison class.
#
class dircmp:
    """A class that manages the comparison of 2 directories.

    dircmp(a, b, ignore=None, hide=None)
      A and B are directories.
      IGNORE is a list of names to ignore,
        defaults to ['RCS', 'CVS', 'tags'].
      HIDE is a list of names to hide,
        defaults to [os.curdir, os.pardir].

    High level usage:
      x = dircmp(dir1, dir2)
      x.report() -> prints a report on the differences between dir1 and dir2
       or
      x.report_partial_closure() -> prints report on differences between dir1
            and dir2, and reports on common immediate subdirectories.
      x.report_full_closure() -> like report_partial_closure,
            but fully recursive.

    Attributes (each one is computed the first time it is read):
     left_list, right_list: The files in dir1 and dir2,
        filtered by hide and ignore, sorted.
     common: a list of names in both dir1 and dir2, in sorted order.
     left_only, right_only: names only in dir1, dir2.
     common_dirs: subdirectories in both dir1 and dir2.
     common_files: files in both dir1 and dir2.
     common_funny: names in both dir1 and dir2 where the type differs between
        dir1 and dir2, or the name is not stat-able.
     same_files: list of identical files.
     diff_files: list of filenames which differ.
     funny_files: list of files which could not be compared.
     subdirs: a dictionary of dircmp objects, keyed by names in common_dirs.
     """

    def __init__(self, a, b, ignore=None, hide=None):
        """Remember the two directories and the names to hide/ignore."""
        self.left = a
        self.right = b
        if hide is None:
            hide = [os.curdir, os.pardir]   # Names never to be shown
        if ignore is None:
            ignore = ['RCS', 'CVS', 'tags']  # Names ignored in comparison
        self.hide = hide
        self.ignore = ignore

    def phase0(self):
        """List both directories, dropping hidden and ignored names."""
        skip = self.hide + self.ignore
        left_names = [n for n in os.listdir(self.left) if n not in skip]
        right_names = [n for n in os.listdir(self.right) if n not in skip]
        left_names.sort()
        right_names.sort()
        self.left_list = left_names
        self.right_list = right_names

    # Which attributes each phase is responsible for setting.
    __p4_attrs = ('subdirs',)
    __p3_attrs = ('same_files', 'diff_files', 'funny_files')
    __p2_attrs = ('common_dirs', 'common_files', 'common_funny')
    __p1_attrs = ('common', 'left_only', 'right_only')
    __p0_attrs = ('left_list', 'right_list')

    def __getattr__(self, attr):
        """Compute a result attribute on first access.

        Only called when normal lookup fails, i.e. before the phase
        that sets 'attr' has run.  Runs that phase and returns the
        freshly stored value; raises AttributeError for any other name.
        """
        if attr in self.__p4_attrs:
            self.phase4()
        elif attr in self.__p3_attrs:
            self.phase3()
        elif attr in self.__p2_attrs:
            self.phase2()
        elif attr in self.__p1_attrs:
            self.phase1()
        elif attr in self.__p0_attrs:
            self.phase0()
        else:
            raise AttributeError(attr)
        return getattr(self, attr)

    def phase1(self):
        """Split the listings into common, left-only and right-only names."""
        in_right = {}
        for name in self.right_list:
            in_right[name] = 1
        in_left = {}
        for name in self.left_list:
            in_left[name] = 1
        # Walking the sorted listings keeps all three results in a
        # predictable (sorted) order.
        self.common = [n for n in self.left_list if n in in_right]
        self.left_only = [n for n in self.left_list if n not in in_right]
        self.right_only = [n for n in self.right_list if n not in in_left]

    def phase2(self):
        """Sort the common names into directories, files and funnies."""
        self.common_dirs = []
        self.common_files = []
        self.common_funny = []

        for x in self.common:
            a_path = os.path.join(self.left, x)
            b_path = os.path.join(self.right, x)

            # Stat directly with os.stat so the classification reflects
            # the current state of the file system on every run.
            try:
                a_type = stat.S_IFMT(os.stat(a_path).st_mode)
                b_type = stat.S_IFMT(os.stat(b_path).st_mode)
            except os.error:
                # A name that cannot be stat()ed on either side is funny.
                self.common_funny.append(x)
                continue

            if a_type != b_type:
                self.common_funny.append(x)
            elif stat.S_ISDIR(a_type):
                self.common_dirs.append(x)
            elif stat.S_ISREG(a_type):
                self.common_files.append(x)
            else:
                self.common_funny.append(x)

    def phase3(self):
        """Compare the common files using cmpfiles()."""
        xx = cmpfiles(self.left, self.right, self.common_files)
        self.same_files, self.diff_files, self.funny_files = xx

    def phase4(self):
        """Create a dircmp object for every common subdirectory.

        The objects are stored in the 'subdirs' dictionary, indexed by
        name; the hide and ignore lists are inherited from the parent.
        """
        self.subdirs = {}
        for x in self.common_dirs:
            a_x = os.path.join(self.left, x)
            b_x = os.path.join(self.right, x)
            self.subdirs[x] = dircmp(a_x, b_x, self.ignore, self.hide)

    def phase4_closure(self):
        """Run phase4() on this object and, recursively, on all subdirs."""
        self.phase4()
        for sd in self.subdirs.values():
            sd.phase4_closure()

    def _report_names(self, heading, names):
        """Sort 'names' in place and print them after 'heading', if any."""
        if names:
            names.sort()
            # One pre-formatted string per line, so the output is the same
            # whether print is a statement or a function.
            print('%s %s' % (heading, names))

    def report(self):
        """Print a report on the differences between a and b."""
        # Output format is purposely lousy
        print('diff %s %s' % (self.left, self.right))
        self._report_names('Only in %s :' % (self.left,), self.left_only)
        self._report_names('Only in %s :' % (self.right,), self.right_only)
        self._report_names('Identical files :', self.same_files)
        self._report_names('Differing files :', self.diff_files)
        self._report_names('Trouble with common files :', self.funny_files)
        self._report_names('Common subdirectories :', self.common_dirs)
        self._report_names('Common funny cases :', self.common_funny)

    def report_partial_closure(self):
        """Print reports on self and on the immediate common subdirs."""
        self.report()
        for sd in self.subdirs.values():
            print('')
            sd.report()

    def report_full_closure(self):
        """Print reports on self and on all common subdirs, recursively."""
        self.report()
        for sd in self.subdirs.values():
            print('')
            sd.report_full_closure()
Guido van Rossum63b08ac2000-06-29 14:13:28 +0000270
271
def cmpfiles(a, b, common, shallow=1, use_statcache=0):
    """Compare the files named in 'common' across two directories.

    a, b -- directory names
    common -- list of file names found in both directories
    shallow -- if true, do comparison based solely on stat() information
    use_statcache -- if true, use statcache.stat() instead of os.stat()

    Returns a tuple of three lists:
      names whose files compare equal
      names whose files compare different
      names for which the comparison raised os.error

    """
    same, different, funny = [], [], []
    # _cmp() answers 0, 1 or 2, which selects the list to extend.
    buckets = (same, different, funny)
    for name in common:
        left_path = os.path.join(a, name)
        right_path = os.path.join(b, name)
        verdict = _cmp(left_path, right_path, shallow, use_statcache)
        buckets[verdict].append(name)
    return buckets
292
293
# Compare two files for cmpfiles().
# The result is usable as an index:
#       0 (false) when the files are equal
#       1 (true) when they are different
#       2 for funny cases (os.error was raised, e.g. can't stat)
#
def _cmp(a, b, sh, st):
    try:
        same = cmp(a, b, sh, st)
    except os.error:
        return 2
    return not abs(same)
305
306
# Build a new list holding the items of 'list' that do not occur in
# 'skip', in their original order.
#
def _filter(list, skip):
    return [item for item in list if item not in skip]
314
315
# Command-line demonstration and testing.
#
def demo():
    """Report on the two directories named on the command line.

    With the -r option the report is fully recursive; otherwise only
    the two directories themselves are reported on.  Raises
    getopt.error unless exactly two arguments are given.
    """
    import getopt
    import sys
    options, args = getopt.getopt(sys.argv[1:], 'r')
    if len(args) != 2:
        raise getopt.error('need exactly two args')
    left, right = args[0], args[1]
    dd = dircmp(left, right)
    recursive = ('-r', '') in options
    if recursive:
        dd.report_full_closure()
    else:
        dd.report()
329
# Run the demonstration when this file is executed as a script.
if __name__ == "__main__":
    demo()