blob: 982e487db903f7e7e087d5015394edee9282fb56 [file] [log] [blame]
Guido van Rossum63b08ac2000-06-29 14:13:28 +00001"""Utilities for comparing files and directories.
Guido van Rossum2d726871999-10-26 14:02:01 +00002
Guido van Rossum63b08ac2000-06-29 14:13:28 +00003Classes:
4 dircmp
5
6Functions:
7 cmp(f1, f2, shallow=1, use_statcache=0) -> int
8 cmpfiles(a, b, common) -> ([], [], [])
9
10"""
11
12import os
13import stat
Guido van Rossum2d726871999-10-26 14:02:01 +000014
Skip Montanaroeccd02a2001-01-20 23:34:12 +000015__all__ = ["cmp","dircmp","cmpfiles"]
16
# Results of past content comparisons:
#   (f1, f2) -> (signature of f1, signature of f2, outcome)
_cache = {}
# Number of entries tolerated in _cache before it is flushed, so that a
# long-running process comparing many files does not grow it forever.
_CACHE_MAX = 100
# Chunk size, in bytes, used when reading file contents.
BUFSIZE = 8 * 1024


def cmp(f1, f2, shallow=1, use_statcache=0):
    """Compare two files.

    Arguments:

    f1 -- First file name

    f2 -- Second file name

    shallow -- If true (the default), files whose stat signatures
               (type, size, modification time) are identical are taken
               to be equal without reading their contents.

    use_statcache -- obsolete argument; it is accepted and ignored.

    Return value:

    True if the files are the same, False otherwise.  Anything that is
    not a regular file always compares as False.

    Raises:

    os.error if either file cannot be stat'ed.

    Content comparisons are remembered in a module-level cache keyed by
    the pair of file names.  A cached outcome is reused only while both
    stat signatures still match the ones recorded with it.

    """
    s1 = _sig(os.stat(f1))
    s2 = _sig(os.stat(f2))
    if s1[0] != stat.S_IFREG or s2[0] != stat.S_IFREG:
        return False
    if shallow and s1 == s2:
        return True
    # Different sizes can never have equal contents; skip the read.
    if s1[1] != s2[1]:
        return False

    cached = _cache.get((f1, f2))
    if cached is not None and cached[:2] == (s1, s2):
        return cached[2]
    outcome = _do_cmp(f1, f2)
    if len(_cache) >= _CACHE_MAX:
        # Crude but cheap bound on memory use: start over.
        _cache.clear()
    _cache[f1, f2] = s1, s2, outcome
    return outcome


def _sig(st):
    """Return the (file type, size, mtime) signature of a stat result."""
    return (stat.S_IFMT(st.st_mode),
            st.st_size,
            st.st_mtime)


def _do_cmp(f1, f2):
    """Return True if files f1 and f2 have byte-for-byte equal contents.

    Both files are read in chunks of BUFSIZE bytes and are always closed
    again, including when a read fails part-way through.
    """
    bufsize = BUFSIZE
    fp1 = open(f1, 'rb')
    try:
        fp2 = open(f2, 'rb')
        try:
            while True:
                b1 = fp1.read(bufsize)
                b2 = fp2.read(bufsize)
                if b1 != b2:
                    return False
                if not b1:
                    # Both files hit end-of-file on the same chunk.
                    return True
        finally:
            fp2.close()
    finally:
        fp1.close()
74
75# Directory comparison class.
76#
class dircmp:
    """A class that manages the comparison of 2 directories.

    dircmp(a, b, ignore=None, hide=None)
      A and B are directories.
      IGNORE is a list of names to ignore,
        defaults to ['RCS', 'CVS', 'tags'].
      HIDE is a list of names to hide,
        defaults to [os.curdir, os.pardir].

    High level usage:
      x = dircmp(dir1, dir2)
      x.report() -> prints a report on the differences between dir1 and dir2
       or
      x.report_partial_closure() -> prints report on differences between dir1
            and dir2, and reports on common immediate subdirectories.
      x.report_full_closure() -> like report_partial_closure,
            but fully recursive.

    Attributes (each is computed on first access, see __getattr__):
     left_list, right_list: The sorted names found in dir1 and dir2,
        filtered by hide and ignore.
     common: a list of names in both dir1 and dir2, in left_list order.
     left_only, right_only: names only in dir1, dir2.
     common_dirs: subdirectories in both dir1 and dir2.
     common_files: files in both dir1 and dir2.
     common_funny: names in both dir1 and dir2 where the type differs between
        dir1 and dir2, or the name is not stat-able.
     same_files: list of identical files.
     diff_files: list of filenames which differ.
     funny_files: list of files which could not be compared.
     subdirs: a dictionary of dircmp objects, keyed by names in common_dirs.
    """

    def __init__(self, a, b, ignore=None, hide=None):
        """Remember the two directories and the names to hide/ignore."""
        self.left = a
        self.right = b
        if hide is None:
            self.hide = [os.curdir, os.pardir]  # Names never to be shown
        else:
            self.hide = hide
        if ignore is None:
            self.ignore = ['RCS', 'CVS', 'tags']  # Names ignored in comparison
        else:
            self.ignore = ignore

    def phase0(self):
        """List both directories, dropping hidden and ignored names."""
        skip = self.hide + self.ignore
        self.left_list = _filter(os.listdir(self.left), skip)
        self.right_list = _filter(os.listdir(self.right), skip)
        self.left_list.sort()
        self.right_list.sort()

    # Which phase produces which attributes; consulted by __getattr__.
    __p4_attrs = ('subdirs',)
    __p3_attrs = ('same_files', 'diff_files', 'funny_files')
    __p2_attrs = ('common_dirs', 'common_files', 'common_funny')
    __p1_attrs = ('common', 'left_only', 'right_only')
    __p0_attrs = ('left_list', 'right_list')

    def __getattr__(self, attr):
        """Compute a result attribute the first time it is requested.

        Python only calls this when normal lookup fails, i.e. before the
        phase that defines `attr` has run.  The phase stores its results
        on the instance, so later accesses never come back here.
        Raises AttributeError for names no phase produces.
        """
        if attr in self.__p4_attrs:
            self.phase4()
        elif attr in self.__p3_attrs:
            self.phase3()
        elif attr in self.__p2_attrs:
            self.phase2()
        elif attr in self.__p1_attrs:
            self.phase1()
        elif attr in self.__p0_attrs:
            self.phase0()
        else:
            raise AttributeError(attr)
        return getattr(self, attr)

    def phase1(self):
        """Split the listed names into common, left_only and right_only."""
        in_left = dict.fromkeys(self.left_list)
        in_right = dict.fromkeys(self.right_list)
        # Built as real lists in listing order so the result is
        # deterministic and can be sorted or indexed by callers.
        self.common = [x for x in self.left_list if x in in_right]
        self.left_only = [x for x in self.left_list if x not in in_right]
        self.right_only = [x for x in self.right_list if x not in in_left]

    def phase2(self):
        """Classify common names as directories, files, or funnies."""
        self.common_dirs = []
        self.common_files = []
        self.common_funny = []

        for x in self.common:
            a_path = os.path.join(self.left, x)
            b_path = os.path.join(self.right, x)

            try:
                a_type = stat.S_IFMT(os.stat(a_path).st_mode)
                b_type = stat.S_IFMT(os.stat(b_path).st_mode)
            except os.error:
                # A name that cannot be stat'ed on either side is funny.
                self.common_funny.append(x)
                continue

            if a_type != b_type:
                self.common_funny.append(x)
            elif stat.S_ISDIR(a_type):
                self.common_dirs.append(x)
            elif stat.S_ISREG(a_type):
                self.common_files.append(x)
            else:
                self.common_funny.append(x)

    def phase3(self):
        """Find out differences between common files."""
        xx = cmpfiles(self.left, self.right, self.common_files)
        self.same_files, self.diff_files, self.funny_files = xx

    def phase4(self):
        """Create a dircmp object for every common subdirectory.

        The objects are stored in the `subdirs` dictionary indexed by
        name; the hide and ignore lists are inherited from the parent.
        """
        self.subdirs = {}
        for x in self.common_dirs:
            a_x = os.path.join(self.left, x)
            b_x = os.path.join(self.right, x)
            self.subdirs[x] = dircmp(a_x, b_x, self.ignore, self.hide)

    def phase4_closure(self):
        """Recursively call phase4() on subdirectories."""
        self.phase4()
        for sd in self.subdirs.values():
            sd.phase4_closure()

    def report(self):
        """Print a report on the differences between a and b.

        Each non-empty result list is sorted in place before printing.
        """
        # Output format is purposely lousy
        print('diff %s %s' % (self.left, self.right))
        sections = (
            ('Only in %s :' % (self.left,), 'left_only'),
            ('Only in %s :' % (self.right,), 'right_only'),
            ('Identical files :', 'same_files'),
            ('Differing files :', 'diff_files'),
            ('Trouble with common files :', 'funny_files'),
            ('Common subdirectories :', 'common_dirs'),
            ('Common funny cases :', 'common_funny'),
        )
        for label, attr in sections:
            names = getattr(self, attr)
            if names:
                names.sort()
                print('%s %s' % (label, names))

    def report_partial_closure(self):
        """Print reports on self and on the immediate common subdirs."""
        self.report()
        for sd in self.subdirs.values():
            print('')
            sd.report()

    def report_full_closure(self):
        """Report on self and subdirs recursively."""
        self.report()
        for sd in self.subdirs.values():
            print('')
            sd.report_full_closure()
Guido van Rossum63b08ac2000-06-29 14:13:28 +0000262
263
def cmpfiles(a, b, common, shallow=1, use_statcache=0):
    """Compare common files in two directories.

    a, b -- directory names
    common -- list of file names found in both directories
    shallow -- passed through to the per-file comparison; if true, files
               with identical stat() signatures are not read
    use_statcache -- obsolete argument, ignored

    Returns a tuple of three lists:
      names whose files compare equal
      names whose files compare different
      names for which the comparison raised os.error

    """
    same, different, funny = [], [], []
    # _cmp() answers with the position of the list the name belongs to.
    buckets = (same, different, funny)
    for name in common:
        left_path = os.path.join(a, name)
        right_path = os.path.join(b, name)
        buckets[_cmp(left_path, right_path, shallow)].append(name)
    return buckets
284
285
# Compare two files for cmpfiles().
# The result doubles as an index into cmpfiles()'s result tuple:
#       False (0) when cmp() reports the files equal
#       True (1) when cmp() reports them different
#       2 when cmp() raised os.error (can't stat, etc.)
#
def _cmp(a, b, sh):
    try:
        same = cmp(a, b, sh)
    except os.error:
        return 2
    return not abs(same)
297
298
299# Return a copy with items that occur in skip removed.
300#
301def _filter(list, skip):
302 result = []
303 for item in list:
304 if item not in skip: result.append(item)
305 return result
306
307
# Command-line demonstration:  [-r] dir1 dir2
# Prints a dircmp report for the two directories; with -r the report
# recurses into common subdirectories.
#
def demo():
    import sys
    import getopt
    opts, paths = getopt.getopt(sys.argv[1:], 'r')
    if len(paths) != 2:
        raise getopt.error('need exactly two args')
    left, right = paths
    comparison = dircmp(left, right)
    recursive = ('-r', '') in opts
    if recursive:
        comparison.report_full_closure()
    else:
        comparison.report()

if __name__ == '__main__':
    demo()