blob: 7060c99bf2a19fe3844b3ea4ae1737cdbe0eec4a [file] [log] [blame]
"""Utilities for comparing files and directories.

Classes:
    dircmp

Functions:
    cmp(f1, f2, shallow=1, use_statcache=0) -> int
    cmpfiles(a, b, common) -> ([], [], [])

"""
11
12import os
13import stat
14import statcache
Guido van Rossum2d726871999-10-26 14:02:01 +000015
# Results of earlier content comparisons made by cmp(), keyed by the
# (f1, f2) file-name pair.  Each value is the tuple
# (signature of f1, signature of f2, outcome).
_cache = {}
# Number of bytes read from each file per step when comparing contents.
BUFSIZE=8*1024
18
def cmp(f1, f2, shallow=1, use_statcache=0):
    """Compare two files.

    Arguments:

    f1 -- First file name

    f2 -- Second file name

    shallow -- If true and the stat signatures (file type, size and
               modification time) of both files are equal, report the
               files as the same without reading them.  Defaults to 1.

    use_statcache -- If true, do not stat() each file directly: go
                     through the statcache module instead.  Any true
                     value is accepted.  Defaults to 0.

    Return value:

    integer -- 1 if the files are the same, 0 otherwise.  Anything that
               is not a regular file always compares as 0.

    Errors raised by the stat function (for instance for a missing
    file) are not caught here and propagate to the caller.

    This function keeps a cache of past content comparisons.  A cached
    outcome is reused only while the signatures of both files still
    match the ones recorded with it.  If 'use_statcache' is true the
    signatures themselves come from a cache, so this invalidation
    mechanism is defeated and a stale outcome may be returned.

    """
    # Select by truth value.  Indexing a two-element tuple with the flag
    # would fail for true values other than 1.
    if use_statcache:
        stat_function = statcache.stat
    else:
        stat_function = os.stat

    s1 = _sig(stat_function(f1))
    s2 = _sig(stat_function(f2))

    # Only regular files can be compared.
    if s1[0] != stat.S_IFREG or s2[0] != stat.S_IFREG:
        return 0
    # Shallow mode trusts identical signatures.
    if shallow and s1 == s2:
        return 1
    # Files of different sizes cannot have the same contents.
    if s1[1] != s2[1]:
        return 0

    # Reuse an earlier outcome if neither signature has changed since.
    result = _cache.get((f1, f2))
    if result and (s1, s2) == result[:2]:
        return result[2]

    outcome = _do_cmp(f1, f2)
    _cache[f1, f2] = s1, s2, outcome
    return outcome
Guido van Rossum2d726871999-10-26 14:02:01 +000056
def _sig(st):
    """Return the signature (file type, size, mtime) of a stat result.

    'st' is indexed with the stat.ST_* constants; the file type is the
    S_IFMT part of its mode field.
    """
    file_type = stat.S_IFMT(st[stat.ST_MODE])
    size = st[stat.ST_SIZE]
    mtime = st[stat.ST_MTIME]
    return (file_type, size, mtime)
Guido van Rossum2d726871999-10-26 14:02:01 +000061
def _do_cmp(f1, f2):
    """Compare the contents of two files, BUFSIZE bytes at a time.

    f1 and f2 are file names; both files are opened in binary mode.
    Returns 1 if the contents are identical and 0 otherwise.  Errors
    from opening or reading the files propagate to the caller.

    Both files are closed before returning, whatever the outcome.
    """
    bufsize = BUFSIZE
    fp1 = open(f1, 'rb')
    try:
        # Opened inside the outer try so that fp1 is still closed if
        # opening the second file fails.
        fp2 = open(f2, 'rb')
        try:
            while 1:
                b1 = fp1.read(bufsize)
                b2 = fp2.read(bufsize)
                if b1 != b2:
                    return 0
                # Both chunks are equal here, so an empty one means that
                # both files reached end-of-file together.
                if not b1:
                    return 1
        finally:
            fp2.close()
    finally:
        fp1.close()
69
70# Directory comparison class.
71#
class dircmp:
    """A class that manages the comparison of 2 directories.

    dircmp(a,b,ignore=None,hide=None)
      A and B are directories.
      IGNORE is a list of names to ignore,
        defaults to ['RCS', 'CVS', 'tags'].
      HIDE is a list of names to hide,
        defaults to [os.curdir, os.pardir].

    High level usage:
      x = dircmp(dir1, dir2)
      x.report() -> prints a report on the differences between dir1 and dir2
       or
      x.report_partial_closure() -> prints report on differences between dir1
            and dir2, and reports on common immediate subdirectories.
      x.report_full_closure() -> like report_partial_closure,
            but fully recursive.

    Attributes:
     left_list, right_list: The files in dir1 and dir2,
        filtered by hide and ignore.
     common: a list of names in both dir1 and dir2.
     left_only, right_only: names only in dir1, dir2.
     common_dirs: subdirectories in both dir1 and dir2.
     common_files: files in both dir1 and dir2.
     common_funny: names in both dir1 and dir2 where the type differs between
        dir1 and dir2, or the name is not stat-able.
     same_files: list of identical files.
     diff_files: list of filenames which differ.
     funny_files: list of files which could not be compared.
     subdirs: a dictionary of dircmp objects, keyed by names in common_dirs.

    Nothing is read from disk by the constructor.  The attributes above
    are computed the first time they are accessed (see __getattr__) and
    then stored on the instance.
    """

    def __init__(self, a, b, ignore=None, hide=None):
        """Record the two directory names and the hide/ignore lists."""
        self.left = a
        self.right = b
        if hide is None:
            self.hide = [os.curdir, os.pardir]  # Names never to be shown
        else:
            self.hide = hide
        if ignore is None:
            self.ignore = ['RCS', 'CVS', 'tags']  # Names ignored in comparison
        else:
            self.ignore = ignore

    def phase0(self):
        """List both directories, dropping hidden and ignored names.

        Sets left_list and right_list, each sorted.
        """
        skip = self.hide + self.ignore
        self.left_list = [name for name in os.listdir(self.left)
                          if name not in skip]
        self.right_list = [name for name in os.listdir(self.right)
                           if name not in skip]
        self.left_list.sort()
        self.right_list.sort()

    # Names of the attributes computed by each phase; used by __getattr__
    # to decide which phase to run for a missing attribute.
    __p4_attrs = ('subdirs',)
    __p3_attrs = ('same_files', 'diff_files', 'funny_files')
    __p2_attrs = ('common_dirs', 'common_files', 'common_funny')
    __p1_attrs = ('common', 'left_only', 'right_only')
    __p0_attrs = ('left_list', 'right_list')

    def __getattr__(self, attr):
        """Compute a result attribute on first access.

        Only called when normal lookup fails.  Runs the phase that sets
        the requested attribute (a phase pulls in the results of earlier
        phases the same way) and returns the now-stored value.  Raises
        AttributeError for any other name.
        """
        if attr in self.__p4_attrs:
            self.phase4()
        elif attr in self.__p3_attrs:
            self.phase3()
        elif attr in self.__p2_attrs:
            self.phase2()
        elif attr in self.__p1_attrs:
            self.phase1()
        elif attr in self.__p0_attrs:
            self.phase0()
        else:
            raise AttributeError(attr)
        return getattr(self, attr)

    def phase1(self):
        """Compute common names.

        Sets common, left_only and right_only.  Each list keeps the
        order of the directory listing it was taken from.
        """
        in_left = {}
        for name in self.left_list:
            in_left[name] = 1
        in_right = {}
        for name in self.right_list:
            in_right[name] = 1
        self.common = [name for name in self.left_list if name in in_right]
        self.left_only = [name for name in self.left_list
                          if name not in in_right]
        self.right_only = [name for name in self.right_list
                           if name not in in_left]

    def phase2(self):
        """Distinguish files, directories, funnies.

        Sets common_dirs, common_files and common_funny from common.
        """
        self.common_dirs = []
        self.common_files = []
        self.common_funny = []

        for name in self.common:
            a_path = os.path.join(self.left, name)
            b_path = os.path.join(self.right, name)

            # os.stat is called directly so that the classification
            # reflects the current state of both directories rather than
            # a previously cached stat result.
            try:
                a_type = stat.S_IFMT(os.stat(a_path)[stat.ST_MODE])
                b_type = stat.S_IFMT(os.stat(b_path)[stat.ST_MODE])
            except os.error:
                # Not stat-able on at least one side.
                self.common_funny.append(name)
                continue

            if a_type != b_type:
                self.common_funny.append(name)
            elif stat.S_ISDIR(a_type):
                self.common_dirs.append(name)
            elif stat.S_ISREG(a_type):
                self.common_files.append(name)
            else:
                # Same type on both sides, but neither a directory nor
                # a regular file.
                self.common_funny.append(name)

    def phase3(self):
        """Find out differences between common files.

        Sets same_files, diff_files and funny_files.
        """
        xx = cmpfiles(self.left, self.right, self.common_files)
        self.same_files, self.diff_files, self.funny_files = xx

    def phase4(self):
        """Find out differences between common subdirectories.

        A new dircmp object is created for each common subdirectory;
        these are stored in the subdirs dictionary, indexed by name.
        The hide and ignore properties are inherited from the parent.
        """
        self.subdirs = {}
        for name in self.common_dirs:
            a_x = os.path.join(self.left, name)
            b_x = os.path.join(self.right, name)
            self.subdirs[name] = dircmp(a_x, b_x, self.ignore, self.hide)

    def phase4_closure(self):
        """Recursively call phase4() on subdirectories."""
        self.phase4()
        for name in self.subdirs.keys():
            self.subdirs[name].phase4_closure()

    def report(self):
        """Print a report on the differences between a and b.

        Output format is purposely lousy.  Each list that is printed is
        sorted in place first; empty lists are not reported.
        """
        print('diff %s %s' % (self.left, self.right))
        sections = (
            ('Only in %s :' % (self.left,), 'left_only'),
            ('Only in %s :' % (self.right,), 'right_only'),
            ('Identical files :', 'same_files'),
            ('Differing files :', 'diff_files'),
            ('Trouble with common files :', 'funny_files'),
            ('Common subdirectories :', 'common_dirs'),
            ('Common funny cases :', 'common_funny'),
        )
        for label, attr in sections:
            names = getattr(self, attr)
            if names:
                names.sort()
                print('%s %s' % (label, names))

    def report_partial_closure(self):
        """Print reports on self and on the immediate common subdirs."""
        self.report()
        for name in self.subdirs.keys():
            print('')
            self.subdirs[name].report()

    def report_full_closure(self):
        """Report on self and subdirs recursively."""
        self.report()
        for name in self.subdirs.keys():
            print('')
            self.subdirs[name].report_full_closure()
257
258
259# Compare common files in two directories.
260# Return:
261# - files that compare equal
262# - files that compare different
263# - funny cases (can't stat etc.)
264#
def cmpfiles(a, b, common):
    """Compare common files in two directories.

    cmpfiles(a,b,common)
      A and B are directory names
      COMMON is a list of file names
    Each name is joined onto both directory names and the resulting pair
    of paths is compared.  Returns a tuple of three lists of names:
      files that compare equal
      files that are different
      funny cases (names that could not be compared, e.g. can't stat)."""

    same, different, funny = [], [], []
    # _cmp() answers 0, 1 or 2, which selects the list of the same index.
    buckets = (same, different, funny)
    for name in common:
        left_path = os.path.join(a, name)
        right_path = os.path.join(b, name)
        buckets[_cmp(left_path, right_path)].append(name)
    return buckets
280
281
282# Compare two files.
283# Return:
284# 0 for equal
285# 1 for different
286# 2 for funny cases (can't stat, etc.)
287#
def _cmp(a, b):
    """Compare two files by name and classify the outcome.

    Returns:
        0 if cmp() reports the files as equal
        1 if cmp() reports them as different
        2 for funny cases: the comparison raised os.error or IOError
          (can't stat, can't open, can't read, ...)
    """
    try:
        if cmp(a, b):
            return 0
        return 1
    except (os.error, IOError):
        # IOError is listed as well because failures to open or read a
        # file are not guaranteed to be reported as os.error.
        return 2
293
294
295# Return a copy with items that occur in skip removed.
296#
def _filter(list, skip):
    """Return a new list holding the items of 'list' that are not in 'skip'.

    The order of the kept items is preserved and 'list' is not modified.
    """
    return [entry for entry in list if entry not in skip]
302
303
304# Demonstration and testing.
305#
def demo():
    """Compare the two directories named on the command line.

    Prints a report for the two directories; with the -r option the
    report also covers common subdirectories, recursively.  Raises
    getopt.error unless exactly two arguments are given.
    """
    import sys
    import getopt
    options, args = getopt.getopt(sys.argv[1:], 'r')
    if len(args) != 2:
        raise getopt.error('need exactly two args')
    left_dir, right_dir = args[0], args[1]
    dd = dircmp(left_dir, right_dir)
    recursive = ('-r', '') in options
    if recursive:
        dd.report_full_closure()
    else:
        dd.report()
316
# Run the demonstration when this file is executed as a script.
if __name__ == '__main__':
    demo()