blob: ec6e1ffc3ec2c37fad04104e63fad6d9faf26d66 [file] [log] [blame]
"""Utilities for comparing files and directories.

Classes:
    dircmp

Functions:
    cmp(f1, f2, shallow=1, use_statcache=0) -> int
    cmpfiles(a, b, common, shallow=1, use_statcache=0) -> ([], [], [])

"""
11
12import os
13import stat
14import statcache
Guido van Rossum2d726871999-10-26 14:02:01 +000015
# Remembered results of content comparisons made by cmp(): maps the
# (f1, f2) name pair to a tuple (signature1, signature2, outcome).
_cache = {}
# Number of bytes read from each file per step by _do_cmp().
BUFSIZE=8*1024
18
def cmp(f1, f2, shallow=1, use_statcache=0):
    """Compare two files.

    Arguments:

    f1 -- First file name

    f2 -- Second file name

    shallow -- If true, two regular files whose stat signatures
               (file type, size, modification time) are equal are
               reported as the same without being read.
               Defaults to 1.

    use_statcache -- If true, obtain stat information through
                     statcache.stat() instead of os.stat().

    Return value:

    integer -- 1 if the files are the same, 0 otherwise.  0 is also
               returned when either name is not a regular file.

    The outcome of each content comparison is remembered per (f1, f2)
    pair together with the two signatures seen at that time, and is
    reused only while both signatures are unchanged.  As the original
    notes say, with 'use_statcache' true this invalidation mechanism is
    defeated and the remembered result will never grow stale.

    """
    stat_function = os.stat
    if use_statcache:
        stat_function = statcache.stat
    sig1 = _sig(stat_function(f1))
    sig2 = _sig(stat_function(f2))

    # Anything that is not a pair of regular files never compares equal.
    if not (sig1[0] == stat.S_IFREG and sig2[0] == stat.S_IFREG):
        return 0
    if shallow and sig1 == sig2:
        return 1
    # Files of different sizes cannot have the same contents.
    if sig1[1] != sig2[1]:
        return 0

    key = (f1, f2)
    cached = _cache.get(key)
    if cached and cached[:2] == (sig1, sig2):
        return cached[2]
    outcome = _do_cmp(f1, f2)
    _cache[key] = (sig1, sig2, outcome)
    return outcome
Guido van Rossum2d726871999-10-26 14:02:01 +000063
def _sig(st):
    """Return the (file type, size, modification time) signature of ST.

    ST is a stat result indexable by the stat.ST_* constants.
    """
    mode = st[stat.ST_MODE]
    return (stat.S_IFMT(mode), st[stat.ST_SIZE], st[stat.ST_MTIME])
Guido van Rossum2d726871999-10-26 14:02:01 +000068
def _do_cmp(f1, f2):
    """Compare the contents of the files named F1 and F2.

    Both files are opened in binary mode and read in lock step,
    BUFSIZE bytes at a time.

    Returns 1 if every chunk read is equal up to and including end of
    file, 0 as soon as two chunks differ.

    Errors raised while opening or reading either file are propagated
    to the caller.  Both files are closed before returning, whether the
    comparison finishes or an error occurs.
    """
    bufsize = BUFSIZE
    fp1 = open(f1, 'rb')
    try:
        fp2 = open(f2, 'rb')
        try:
            while 1:
                b1 = fp1.read(bufsize)
                b2 = fp2.read(bufsize)
                if b1 != b2:
                    return 0
                # Equal and empty chunks: both files ended together.
                if not b1:
                    return 1
        finally:
            fp2.close()
    finally:
        fp1.close()
80
# Directory comparison class.
#
class dircmp:
    """Compare two directories, computing each result on first use.

    dircmp(a,b,ignore=None,hide=None)
      A and B are directories.
      IGNORE is a list of names to ignore,
        defaults to ['RCS', 'CVS', 'tags'].
      HIDE is a list of names to hide,
        defaults to [os.curdir, os.pardir].

    High level usage:
      x = dircmp(dir1, dir2)
      x.report() -> prints a report on the differences between dir1 and dir2
       or
      x.report_partial_closure() -> prints report on differences between dir1
            and dir2, and reports on common immediate subdirectories.
      x.report_full_closure() -> like report_partial_closure,
            but fully recursive.

    Attributes (filled in by the phaseN() methods, which __getattr__
    runs the first time one of these names is looked up):
     left_list, right_list: The files in dir1 and dir2,
        filtered by hide and ignore.
     common: a list of names in both dir1 and dir2.
     left_only, right_only: names only in dir1, dir2.
     common_dirs: subdirectories in both dir1 and dir2.
     common_files: files in both dir1 and dir2.
     common_funny: names in both dir1 and dir2 where the type differs between
        dir1 and dir2, or the name is not stat-able.
     same_files: list of identical files.
     diff_files: list of filenames which differ.
     funny_files: list of files which could not be compared.
     subdirs: a dictionary of dircmp objects, keyed by names in common_dirs.
    """

    def __init__(self, a, b, ignore=None, hide=None):
        """Remember the two directories and the hide/ignore name lists."""
        self.left = a
        self.right = b
        # Names never to be shown.
        if hide is not None:
            self.hide = hide
        else:
            self.hide = [os.curdir, os.pardir]
        # Names ignored in comparison.
        if ignore is not None:
            self.ignore = ignore
        else:
            self.ignore = ['RCS', 'CVS', 'tags']

    def phase0(self):
        """List both directories, dropping hidden and ignored names."""
        for attr, directory in (('left_list', self.left),
                                ('right_list', self.right)):
            setattr(self, attr,
                    _filter(os.listdir(directory), self.hide + self.ignore))
        self.left_list.sort()
        self.right_list.sort()

    # Attribute names grouped by the phase method that defines them.
    __p4_attrs = ('subdirs',)
    __p3_attrs = ('same_files', 'diff_files', 'funny_files')
    __p2_attrs = ('common_dirs', 'common_files', 'common_funny')
    __p1_attrs = ('common', 'left_only', 'right_only')
    __p0_attrs = ('left_list', 'right_list')

    def __getattr__(self, attr):
        """Run the phase that defines ATTR, then return its value.

        Only reached for names not yet set on the instance; names that
        belong to no phase raise AttributeError.
        """
        phases = ((self.__p4_attrs, self.phase4),
                  (self.__p3_attrs, self.phase3),
                  (self.__p2_attrs, self.phase2),
                  (self.__p1_attrs, self.phase1),
                  (self.__p0_attrs, self.phase0))
        for names, run_phase in phases:
            if attr in names:
                run_phase()
                return getattr(self, attr)
        raise AttributeError(attr)

    def phase1(self):
        """Compute the common, left-only and right-only names."""
        in_right = {}
        for name in self.right_list:
            in_right[name] = 1
        shared = {}
        left_only = []
        for name in self.left_list:
            if in_right.get(name, 0):
                shared[name] = 1
            else:
                left_only.append(name)
        right_only = [name for name in self.right_list
                      if not shared.get(name, 0)]
        self.common = shared.keys()
        self.left_only = left_only
        self.right_only = right_only

    def phase2(self):
        """Distinguish common files, common directories and funnies."""
        self.common_dirs = []
        self.common_files = []
        self.common_funny = []

        for name in self.common:
            left_path = os.path.join(self.left, name)
            right_path = os.path.join(self.right, name)

            # A name that cannot be stat-ed on either side is funny.
            stat_ok = 1
            try:
                left_stat = statcache.stat(left_path)
            except os.error:
                stat_ok = 0
            try:
                right_stat = statcache.stat(right_path)
            except os.error:
                stat_ok = 0
            if not stat_ok:
                self.common_funny.append(name)
                continue

            left_type = stat.S_IFMT(left_stat[stat.ST_MODE])
            right_type = stat.S_IFMT(right_stat[stat.ST_MODE])
            if left_type != right_type:
                bucket = self.common_funny
            elif stat.S_ISDIR(left_type):
                bucket = self.common_dirs
            elif stat.S_ISREG(left_type):
                bucket = self.common_files
            else:
                bucket = self.common_funny
            bucket.append(name)

    def phase3(self):
        """Find out differences between common files."""
        same, different, funny = cmpfiles(self.left, self.right,
                                          self.common_files)
        self.same_files = same
        self.diff_files = different
        self.funny_files = funny

    def phase4(self):
        """Create a dircmp object for every common subdirectory.

        The objects are stored in the subdirs dictionary, indexed by
        name; hide and ignore are inherited from this object.
        """
        self.subdirs = {}
        for name in self.common_dirs:
            self.subdirs[name] = dircmp(os.path.join(self.left, name),
                                        os.path.join(self.right, name),
                                        self.ignore, self.hide)

    def phase4_closure(self):
        """Recursively call phase4() on subdirectories."""
        self.phase4()
        for child in self.subdirs.values():
            child.phase4_closure()

    def report(self):
        """Print a report on the differences between a and b."""
        # Output format is purposely lousy.  Each line is formatted into
        # one string first, so it reads the same whether print is a
        # statement or a function.
        print('%s %s %s' % ('diff', self.left, self.right))
        for directory, attr in ((self.left, 'left_only'),
                                (self.right, 'right_only')):
            names = getattr(self, attr)
            if names:
                names.sort()
                print('%s %s %s %s' % ('Only in', directory, ':', names))
        for heading, attr in (('Identical files :', 'same_files'),
                              ('Differing files :', 'diff_files'),
                              ('Trouble with common files :', 'funny_files'),
                              ('Common subdirectories :', 'common_dirs'),
                              ('Common funny cases :', 'common_funny')):
            names = getattr(self, attr)
            if names:
                names.sort()
                print('%s %s' % (heading, names))

    def report_partial_closure(self):
        """Print reports on self and on the immediate subdirectories."""
        self.report()
        for child in self.subdirs.values():
            print('')
            child.report()

    def report_full_closure(self):
        """Report on self and subdirectories recursively."""
        self.report()
        for child in self.subdirs.values():
            print('')
            child.report_full_closure()
268
269
def cmpfiles(a, b, common, shallow=1, use_statcache=0):
    """Compare common files in two directories.

    a, b -- directory names
    common -- list of file names found in both directories
    shallow -- if true, do comparison based solely on stat() information
    use_statcache -- if true, use statcache.stat() instead of os.stat()

    Returns a tuple of three lists:
      names whose files compare equal
      names whose files are different (cmp() also reports a name that
        is not a regular file on both sides as different)
      names for which the comparison raised os.error

    """
    same, different, funny = [], [], []
    # _cmp() answers 0, 1 or 2, which selects the list to extend.
    buckets = (same, different, funny)
    for name in common:
        left_path = os.path.join(a, name)
        right_path = os.path.join(b, name)
        verdict = _cmp(left_path, right_path, shallow, use_statcache)
        buckets[verdict].append(name)
    return buckets
290
291
# Compare two files.
# Arguments are passed straight through to cmp():
#       a, b -- file names
#       sh   -- the 'shallow' flag
#       st   -- the 'use_statcache' flag
# Return:
#       0 for equal
#       1 for different
#       2 for funny cases (can't stat, can't open or read, etc.)
#
def _cmp(a, b, sh, st):
    try:
        # cmp() answers 1 for "same"; this helper uses 0 for "equal".
        return not abs(cmp(a, b, sh, st))
    except (os.error, IOError):
        # os.error covers a failing stat(); a file that cannot be opened
        # or read raises IOError, which is a distinct exception on
        # Python 2 and must be caught as well to count as funny.
        return 2
303
304
# Return a new list holding the items of list, in their original
# order, that do not occur in skip.
#
def _filter(list, skip):
    return [entry for entry in list if entry not in skip]
312
313
# Demonstration and testing.
#
def demo():
    """Compare the two directories named on the command line.

    Exactly two arguments are required, otherwise getopt.error is
    raised.  With the -r option the report is fully recursive
    (report_full_closure()); without it only the two directories
    themselves are reported on (report()).
    """
    import sys
    import getopt
    options, args = getopt.getopt(sys.argv[1:], 'r')
    if len(args) != 2:
        raise getopt.error('need exactly two args')
    left, right = args
    comparison = dircmp(left, right)
    recursive = ('-r', '') in options
    if recursive:
        comparison.report_full_closure()
    else:
        comparison.report()

if __name__ == '__main__':
    demo()