blob: eaacb2976d620e1366fc4c7b36db67955e5ce15b [file] [log] [blame]
"""Utilities for comparing files and directories.

Classes:
    dircmp

Functions:
    cmp(f1, f2, shallow=1) -> int
    cmpfiles(a, b, common) -> ([], [], [])

"""
11
12import os
13import stat
Andrew M. Kuchling83e879d2003-02-06 19:38:45 +000014import warnings
Guido van Rossum2d726871999-10-26 14:02:01 +000015
# Public names exported by "from <module> import *".
__all__ = ["cmp", "dircmp", "cmpfiles"]
17
# Results of past content comparisons made by cmp(), keyed by (f1, f2).
# Each value is (signature of f1, signature of f2, outcome).
_cache = {}
# Number of bytes read from each file per step of a content comparison.
BUFSIZE = 8*1024
20
def cmp(f1, f2, shallow=1, use_statcache=None):
    """Compare two files.

    Arguments:

    f1 -- First file name

    f2 -- Second file name

    shallow -- If true (the default), two files whose stat signatures
               (type, size, modification time) are equal are reported
               as equal without reading their contents.

    use_statcache -- Obsolete and ignored; passing anything other than
                     None only triggers a DeprecationWarning.

    Return value:

    True if the files are the same, False otherwise.  If either name is
    not a regular file the result is always False.

    Raises os.error if either file cannot be stat'ed.

    This function uses a cache for past comparisons and the results,
    with a cache invalidation mechanism relying on stale signatures.

    """
    if use_statcache is not None:
        # stacklevel=2 makes the warning point at the caller's line.
        warnings.warn("use_statcache argument is deprecated",
                      DeprecationWarning, stacklevel=2)

    s1 = _sig(os.stat(f1))
    s2 = _sig(os.stat(f2))
    if s1[0] != stat.S_IFREG or s2[0] != stat.S_IFREG:
        return False
    if shallow and s1 == s2:
        return True
    if s1[1] != s2[1]:
        # Different sizes: the contents cannot match, no need to read.
        return False

    cached = _cache.get((f1, f2))
    # A cached outcome is only valid while both signatures are unchanged.
    if cached is not None and cached[:2] == (s1, s2):
        return cached[2]
    # Normalise to a real boolean, as promised by the docstring.
    outcome = bool(_do_cmp(f1, f2))
    _cache[f1, f2] = s1, s2, outcome
    return outcome
Guido van Rossum2d726871999-10-26 14:02:01 +000062
def _sig(st):
    """Return the comparison signature of a stat result.

    st -- an object with st_mode, st_size and st_mtime attributes,
          such as the value returned by os.stat().

    The signature is the tuple (file type bits, size, modification time).
    """
    return (stat.S_IFMT(st.st_mode),
            st.st_size,
            st.st_mtime)
Guido van Rossum2d726871999-10-26 14:02:01 +000067
def _do_cmp(f1, f2):
    """Compare the contents of two files chunk by chunk.

    f1, f2 -- names of the files to read (opened in binary mode).

    Returns True if every chunk read from both files is equal, False as
    soon as two chunks differ.  Errors raised by open() or read() are
    not caught here.
    """
    bufsize = BUFSIZE
    fp1 = open(f1, 'rb')
    # try/finally guarantees both files are closed on every exit path,
    # including the early return and any read error.
    try:
        fp2 = open(f2, 'rb')
        try:
            while True:
                b1 = fp1.read(bufsize)
                b2 = fp2.read(bufsize)
                if b1 != b2:
                    return False
                if not b1:
                    # Both reads returned nothing: end of both files.
                    return True
        finally:
            fp2.close()
    finally:
        fp1.close()
79
# Directory comparison class.
#
class dircmp:
    """A class that manages the comparison of 2 directories.

    dircmp(a,b,ignore=None,hide=None)
      A and B are directories.
      IGNORE is a list of names to ignore,
        defaults to ['RCS', 'CVS', 'tags'].
      HIDE is a list of names to hide,
        defaults to [os.curdir, os.pardir].

    High level usage:
      x = dircmp(dir1, dir2)
      x.report() -> prints a report on the differences between dir1 and dir2
       or
      x.report_partial_closure() -> prints report on differences between dir1
            and dir2, and reports on common immediate subdirectories.
      x.report_full_closure() -> like report_partial_closure,
            but fully recursive.

    Attributes (each one is computed on first access by running the
    matching phaseN() method, and is then stored on the instance):
     left_list, right_list: The files in dir1 and dir2,
        filtered by hide and ignore.
     common: a list of names in both dir1 and dir2.
     left_only, right_only: names only in dir1, dir2.
     common_dirs: subdirectories in both dir1 and dir2.
     common_files: files in both dir1 and dir2.
     common_funny: names in both dir1 and dir2 where the type differs between
        dir1 and dir2, or the name is not stat-able.
     same_files: list of identical files.
     diff_files: list of filenames which differ.
     funny_files: list of files which could not be compared.
     subdirs: a dictionary of dircmp objects, keyed by names in common_dirs.
     """

    def __init__(self, a, b, ignore=None, hide=None): # Initialize
        self.left = a
        self.right = b
        if hide is None:
            self.hide = [os.curdir, os.pardir] # Names never to be shown
        else:
            self.hide = hide
        if ignore is None:
            self.ignore = ['RCS', 'CVS', 'tags'] # Names ignored in comparison
        else:
            self.ignore = ignore

    def phase0(self): # List both directories, minus hidden/ignored names
        skipped = self.hide + self.ignore
        self.left_list = _filter(os.listdir(self.left), skipped)
        self.right_list = _filter(os.listdir(self.right), skipped)
        self.left_list.sort()
        self.right_list.sort()

    # Which phase produces which attributes; consulted by __getattr__.
    __p4_attrs = ('subdirs',)
    __p3_attrs = ('same_files', 'diff_files', 'funny_files')
    __p2_attrs = ('common_dirs', 'common_files', 'common_funny')
    __p1_attrs = ('common', 'left_only', 'right_only')
    __p0_attrs = ('left_list', 'right_list')

    def __getattr__(self, attr):
        # Only called when normal lookup fails, i.e. the attribute has
        # not been computed yet: run the phase that sets it, then retry.
        if attr in self.__p4_attrs:
            self.phase4()
        elif attr in self.__p3_attrs:
            self.phase3()
        elif attr in self.__p2_attrs:
            self.phase2()
        elif attr in self.__p1_attrs:
            self.phase1()
        elif attr in self.__p0_attrs:
            self.phase0()
        else:
            raise AttributeError(attr)
        return getattr(self, attr)

    def phase1(self): # Compute common names
        # Dictionaries give fast membership tests on each listing.
        in_left = dict.fromkeys(self.left_list)
        in_right = dict.fromkeys(self.right_list)
        # Building from the listings keeps their order in the results.
        self.common = [x for x in self.left_list if x in in_right]
        self.left_only = [x for x in self.left_list if x not in in_right]
        self.right_only = [x for x in self.right_list if x not in in_left]

    def phase2(self): # Distinguish files, directories, funnies
        self.common_dirs = []
        self.common_files = []
        self.common_funny = []

        for x in self.common:
            a_path = os.path.join(self.left, x)
            b_path = os.path.join(self.right, x)

            try:
                a_type = stat.S_IFMT(os.stat(a_path).st_mode)
                b_type = stat.S_IFMT(os.stat(b_path).st_mode)
            except os.error:
                # Not stat-able on at least one side.
                self.common_funny.append(x)
                continue

            if a_type != b_type:
                self.common_funny.append(x)
            elif stat.S_ISDIR(a_type):
                self.common_dirs.append(x)
            elif stat.S_ISREG(a_type):
                self.common_files.append(x)
            else:
                # Same type on both sides, but neither file nor directory.
                self.common_funny.append(x)

    def phase3(self): # Find out differences between common files
        xx = cmpfiles(self.left, self.right, self.common_files)
        self.same_files, self.diff_files, self.funny_files = xx

    def phase4(self): # Find out differences between common subdirectories
        # A new dircmp object is created for each common subdirectory,
        # these are stored in a dictionary indexed by filename.
        # The hide and ignore properties are inherited from the parent
        self.subdirs = {}
        for x in self.common_dirs:
            a_x = os.path.join(self.left, x)
            b_x = os.path.join(self.right, x)
            self.subdirs[x] = dircmp(a_x, b_x, self.ignore, self.hide)

    def phase4_closure(self): # Recursively call phase4() on subdirectories
        self.phase4()
        for sd in self.subdirs.values():
            sd.phase4_closure()

    def report(self): # Print a report on the differences between a and b
        # Output format is purposely lousy
        # Each print() takes one pre-formatted string so the output is
        # the same under Python 2 and Python 3.
        print('diff %s %s' % (self.left, self.right))
        if self.left_only:
            self.left_only.sort()
            print('Only in %s : %s' % (self.left, self.left_only))
        if self.right_only:
            self.right_only.sort()
            print('Only in %s : %s' % (self.right, self.right_only))
        if self.same_files:
            self.same_files.sort()
            print('Identical files : %s' % (self.same_files,))
        if self.diff_files:
            self.diff_files.sort()
            print('Differing files : %s' % (self.diff_files,))
        if self.funny_files:
            self.funny_files.sort()
            print('Trouble with common files : %s' % (self.funny_files,))
        if self.common_dirs:
            self.common_dirs.sort()
            print('Common subdirectories : %s' % (self.common_dirs,))
        if self.common_funny:
            self.common_funny.sort()
            print('Common funny cases : %s' % (self.common_funny,))

    def report_partial_closure(self): # Print reports on self and on subdirs
        self.report()
        for sd in self.subdirs.values():
            print('')
            sd.report()

    def report_full_closure(self): # Report on self and subdirs recursively
        self.report()
        for sd in self.subdirs.values():
            print('')
            sd.report_full_closure()
Guido van Rossum63b08ac2000-06-29 14:13:28 +0000267
268
def cmpfiles(a, b, common, shallow=1, use_statcache=None):
    """Compare common files in two directories.

    a, b -- directory names
    common -- list of file names found in both directories
    shallow -- if true, do comparison based solely on stat() information
    use_statcache -- obsolete argument; any value other than None only
                     triggers a DeprecationWarning

    Returns a tuple of three lists:
      files that compare equal
      files that are different
      filenames that could not be compared (for example because one of
      the two paths could not be stat'ed).

    """
    if use_statcache is not None:
        # stacklevel=2 makes the warning point at the caller's line.
        warnings.warn("use_statcache argument is deprecated",
                      DeprecationWarning, stacklevel=2)
    res = ([], [], [])
    for x in common:
        ax = os.path.join(a, x)
        bx = os.path.join(b, x)
        # _cmp() yields 0, 1 or 2, which selects the list to append to.
        res[_cmp(ax, bx, shallow)].append(x)
    return res
292
293
# Compare two files.
# Return:
#       0 for equal
#       1 for different
#       2 for funny cases (can't stat, etc.)
#
def _cmp(a, b, sh):
    """Classify a pair of files for cmpfiles().

    a, b -- names of the two files
    sh -- passed to cmp() as its 'shallow' argument

    Returns 0, 1 or 2 as described in the comment above.
    """
    try:
        if cmp(a, b, sh):
            return 0
        return 1
    except (os.error, IOError):
        # os.error covers stat() failures; IOError covers files that
        # exist but cannot be opened or read (a separate class in
        # Python 2).
        return 2
305
306
# Return a copy with items that occur in skip removed.
#
def _filter(list, skip):
    """Return a new list of the items of 'list' that are not in 'skip'.

    The order of the kept items is preserved and neither argument is
    modified.  The parameter name 'list' shadows the builtin; it is
    kept unchanged so existing callers are unaffected.
    """
    return [item for item in list if item not in skip]
314
315
# Demonstration and testing.
#
def demo():
    """Compare the two directories named on the command line.

    Usage: [-r] dir1 dir2

    Prints a report for the two directories; with -r the report also
    covers common subdirectories recursively.  Raises
    getopt.GetoptError unless exactly two directory arguments are given.
    """
    import sys
    import getopt
    options, args = getopt.getopt(sys.argv[1:], 'r')
    if len(args) != 2:
        raise getopt.GetoptError('need exactly two args', None)
    dd = dircmp(args[0], args[1])
    if ('-r', '') in options:
        dd.report_full_closure()
    else:
        dd.report()
329
# Run the demonstration only when executed as a script, not on import.
if __name__ == '__main__':
    demo()