| Guido van Rossum | 63b08ac | 2000-06-29 14:13:28 +0000 | [diff] [blame] | 1 | """Utilities for comparing files and directories. | 
| Guido van Rossum | 2d72687 | 1999-10-26 14:02:01 +0000 | [diff] [blame] | 2 |  | 
| Guido van Rossum | 63b08ac | 2000-06-29 14:13:28 +0000 | [diff] [blame] | 3 | Classes: | 
 | 4 |     dircmp | 
 | 5 |  | 
 | 6 | Functions: | 
 | 7 |     cmp(f1, f2, shallow=1, use_statcache=0) -> int | 
    cmpfiles(a, b, common, shallow=1, use_statcache=0) -> ([], [], [])
 | 9 |  | 
 | 10 | """ | 
 | 11 |  | 
 | 12 | import os | 
 | 13 | import stat | 
 | 14 | import statcache | 
| Guido van Rossum | 2d72687 | 1999-10-26 14:02:01 +0000 | [diff] [blame] | 15 |  | 
# Public names exported by a star-import of this module.
__all__ = ["cmp","dircmp","cmpfiles"]

# Results of earlier content comparisons made by cmp(): maps the pair
# (f1, f2) to (signature of f1, signature of f2, outcome).
_cache = {}
# Number of bytes _do_cmp() reads from each file per chunk.
BUFSIZE=8*1024
 | 20 |  | 
def cmp(f1, f2, shallow=1, use_statcache=0):
    """Report whether two files look identical.

    Arguments:

    f1 -- name of the first file

    f2 -- name of the second file

    shallow -- when true (the default), two files whose stat
               signatures (type, size, mtime) are equal are accepted
               as identical without reading them.

    use_statcache -- when true, obtain stat information through
                     statcache.stat() instead of os.stat().

    Return value:

    A true value if the files are considered the same, a false value
    otherwise.  False is returned at once when either name is not a
    regular file or when the two sizes differ.

    Content comparisons are remembered in a module-level cache keyed
    by (f1, f2); an entry is reused only while both stat signatures
    still match the ones recorded with it.  With 'use_statcache' true
    the signatures come from statcache, so a changed file may go
    unnoticed.

    """
    stat_function = os.stat
    if use_statcache:
        stat_function = statcache.stat
    sig1 = _sig(stat_function(f1))
    sig2 = _sig(stat_function(f2))

    # Only two regular files can ever compare equal.
    if not (sig1[0] == stat.S_IFREG and sig2[0] == stat.S_IFREG):
        return False
    if shallow and sig1 == sig2:
        return True
    # Different sizes: no need to look at the contents.
    if sig1[1] != sig2[1]:
        return False

    key = (f1, f2)
    cached = _cache.get(key)
    if cached and cached[:2] == (sig1, sig2):
        return cached[2]
    outcome = _do_cmp(f1, f2)
    _cache[key] = (sig1, sig2, outcome)
    return outcome
| Guido van Rossum | 2d72687 | 1999-10-26 14:02:01 +0000 | [diff] [blame] | 65 |  | 
 | 66 | def _sig(st): | 
| Raymond Hettinger | 32200ae | 2002-06-01 19:51:15 +0000 | [diff] [blame] | 67 |     return (stat.S_IFMT(st.st_mode), | 
 | 68 |             st.st_size, | 
 | 69 |             st.st_mtime) | 
| Guido van Rossum | 2d72687 | 1999-10-26 14:02:01 +0000 | [diff] [blame] | 70 |  | 
 | 71 | def _do_cmp(f1, f2): | 
| Guido van Rossum | 63b08ac | 2000-06-29 14:13:28 +0000 | [diff] [blame] | 72 |     bufsize = BUFSIZE | 
 | 73 |     fp1 = open(f1, 'rb') | 
 | 74 |     fp2 = open(f2, 'rb') | 
 | 75 |     while 1: | 
 | 76 |         b1 = fp1.read(bufsize) | 
 | 77 |         b2 = fp2.read(bufsize) | 
 | 78 |         if b1 != b2: | 
 | 79 |             return 0 | 
 | 80 |         if not b1: | 
 | 81 |             return 1 | 
 | 82 |  | 
 | 83 | # Directory comparison class. | 
 | 84 | # | 
 | 85 | class dircmp: | 
 | 86 |     """A class that manages the comparison of 2 directories. | 
 | 87 |  | 
 | 88 |     dircmp(a,b,ignore=None,hide=None) | 
 | 89 |       A and B are directories. | 
 | 90 |       IGNORE is a list of names to ignore, | 
 | 91 |         defaults to ['RCS', 'CVS', 'tags']. | 
 | 92 |       HIDE is a list of names to hide, | 
 | 93 |         defaults to [os.curdir, os.pardir]. | 
 | 94 |  | 
 | 95 |     High level usage: | 
 | 96 |       x = dircmp(dir1, dir2) | 
 | 97 |       x.report() -> prints a report on the differences between dir1 and dir2 | 
 | 98 |        or | 
 | 99 |       x.report_partial_closure() -> prints report on differences between dir1 | 
 | 100 |             and dir2, and reports on common immediate subdirectories. | 
 | 101 |       x.report_full_closure() -> like report_partial_closure, | 
 | 102 |             but fully recursive. | 
 | 103 |  | 
 | 104 |     Attributes: | 
 | 105 |      left_list, right_list: The files in dir1 and dir2, | 
 | 106 |         filtered by hide and ignore. | 
 | 107 |      common: a list of names in both dir1 and dir2. | 
 | 108 |      left_only, right_only: names only in dir1, dir2. | 
 | 109 |      common_dirs: subdirectories in both dir1 and dir2. | 
 | 110 |      common_files: files in both dir1 and dir2. | 
 | 111 |      common_funny: names in both dir1 and dir2 where the type differs between | 
 | 112 |         dir1 and dir2, or the name is not stat-able. | 
 | 113 |      same_files: list of identical files. | 
 | 114 |      diff_files: list of filenames which differ. | 
 | 115 |      funny_files: list of files which could not be compared. | 
 | 116 |      subdirs: a dictionary of dircmp objects, keyed by names in common_dirs. | 
 | 117 |      """ | 
 | 118 |  | 
 | 119 |     def __init__(self, a, b, ignore=None, hide=None): # Initialize | 
 | 120 |         self.left = a | 
 | 121 |         self.right = b | 
 | 122 |         if hide is None: | 
 | 123 |             self.hide = [os.curdir, os.pardir] # Names never to be shown | 
 | 124 |         else: | 
 | 125 |             self.hide = hide | 
 | 126 |         if ignore is None: | 
 | 127 |             self.ignore = ['RCS', 'CVS', 'tags'] # Names ignored in comparison | 
 | 128 |         else: | 
 | 129 |             self.ignore = ignore | 
 | 130 |  | 
 | 131 |     def phase0(self): # Compare everything except common subdirectories | 
 | 132 |         self.left_list = _filter(os.listdir(self.left), | 
 | 133 |                                  self.hide+self.ignore) | 
 | 134 |         self.right_list = _filter(os.listdir(self.right), | 
 | 135 |                                   self.hide+self.ignore) | 
 | 136 |         self.left_list.sort() | 
 | 137 |         self.right_list.sort() | 
 | 138 |  | 
 | 139 |     __p4_attrs = ('subdirs',) | 
 | 140 |     __p3_attrs = ('same_files', 'diff_files', 'funny_files') | 
 | 141 |     __p2_attrs = ('common_dirs', 'common_files', 'common_funny') | 
 | 142 |     __p1_attrs = ('common', 'left_only', 'right_only') | 
 | 143 |     __p0_attrs = ('left_list', 'right_list') | 
 | 144 |  | 
 | 145 |     def __getattr__(self, attr): | 
 | 146 |         if attr in self.__p4_attrs: | 
 | 147 |             self.phase4() | 
 | 148 |         elif attr in self.__p3_attrs: | 
 | 149 |             self.phase3() | 
 | 150 |         elif attr in self.__p2_attrs: | 
 | 151 |             self.phase2() | 
 | 152 |         elif attr in self.__p1_attrs: | 
 | 153 |             self.phase1() | 
 | 154 |         elif attr in self.__p0_attrs: | 
 | 155 |             self.phase0() | 
 | 156 |         else: | 
 | 157 |             raise AttributeError, attr | 
 | 158 |         return getattr(self, attr) | 
 | 159 |  | 
 | 160 |     def phase1(self): # Compute common names | 
 | 161 |         a_only, b_only = [], [] | 
 | 162 |         common = {} | 
 | 163 |         b = {} | 
 | 164 |         for fnm in self.right_list: | 
 | 165 |             b[fnm] = 1 | 
 | 166 |         for x in self.left_list: | 
 | 167 |             if b.get(x, 0): | 
 | 168 |                 common[x] = 1 | 
 | 169 |             else: | 
 | 170 |                 a_only.append(x) | 
 | 171 |         for x in self.right_list: | 
 | 172 |             if common.get(x, 0): | 
 | 173 |                 pass | 
 | 174 |             else: | 
 | 175 |                 b_only.append(x) | 
 | 176 |         self.common = common.keys() | 
 | 177 |         self.left_only = a_only | 
 | 178 |         self.right_only = b_only | 
 | 179 |  | 
 | 180 |     def phase2(self): # Distinguish files, directories, funnies | 
 | 181 |         self.common_dirs = [] | 
 | 182 |         self.common_files = [] | 
 | 183 |         self.common_funny = [] | 
 | 184 |  | 
 | 185 |         for x in self.common: | 
 | 186 |             a_path = os.path.join(self.left, x) | 
 | 187 |             b_path = os.path.join(self.right, x) | 
 | 188 |  | 
 | 189 |             ok = 1 | 
 | 190 |             try: | 
 | 191 |                 a_stat = statcache.stat(a_path) | 
 | 192 |             except os.error, why: | 
 | 193 |                 # print 'Can\'t stat', a_path, ':', why[1] | 
 | 194 |                 ok = 0 | 
 | 195 |             try: | 
 | 196 |                 b_stat = statcache.stat(b_path) | 
 | 197 |             except os.error, why: | 
 | 198 |                 # print 'Can\'t stat', b_path, ':', why[1] | 
 | 199 |                 ok = 0 | 
 | 200 |  | 
 | 201 |             if ok: | 
| Raymond Hettinger | 32200ae | 2002-06-01 19:51:15 +0000 | [diff] [blame] | 202 |                 a_type = stat.S_IFMT(a_stat.st_mode) | 
 | 203 |                 b_type = stat.S_IFMT(b_stat.st_mode) | 
| Fred Drake | 8152d32 | 2000-12-12 23:20:45 +0000 | [diff] [blame] | 204 |                 if a_type != b_type: | 
| Guido van Rossum | 63b08ac | 2000-06-29 14:13:28 +0000 | [diff] [blame] | 205 |                     self.common_funny.append(x) | 
 | 206 |                 elif stat.S_ISDIR(a_type): | 
 | 207 |                     self.common_dirs.append(x) | 
 | 208 |                 elif stat.S_ISREG(a_type): | 
 | 209 |                     self.common_files.append(x) | 
 | 210 |                 else: | 
 | 211 |                     self.common_funny.append(x) | 
 | 212 |             else: | 
 | 213 |                 self.common_funny.append(x) | 
 | 214 |  | 
 | 215 |     def phase3(self): # Find out differences between common files | 
 | 216 |         xx = cmpfiles(self.left, self.right, self.common_files) | 
 | 217 |         self.same_files, self.diff_files, self.funny_files = xx | 
 | 218 |  | 
 | 219 |     def phase4(self): # Find out differences between common subdirectories | 
 | 220 |         # A new dircmp object is created for each common subdirectory, | 
 | 221 |         # these are stored in a dictionary indexed by filename. | 
 | 222 |         # The hide and ignore properties are inherited from the parent | 
 | 223 |         self.subdirs = {} | 
 | 224 |         for x in self.common_dirs: | 
 | 225 |             a_x = os.path.join(self.left, x) | 
 | 226 |             b_x = os.path.join(self.right, x) | 
 | 227 |             self.subdirs[x]  = dircmp(a_x, b_x, self.ignore, self.hide) | 
 | 228 |  | 
 | 229 |     def phase4_closure(self): # Recursively call phase4() on subdirectories | 
 | 230 |         self.phase4() | 
| Raymond Hettinger | e0d4972 | 2002-06-02 18:55:56 +0000 | [diff] [blame] | 231 |         for sd in self.subdirs.itervalues(): | 
 | 232 |             sd.phase4_closure() | 
| Guido van Rossum | 63b08ac | 2000-06-29 14:13:28 +0000 | [diff] [blame] | 233 |  | 
 | 234 |     def report(self): # Print a report on the differences between a and b | 
 | 235 |         # Output format is purposely lousy | 
 | 236 |         print 'diff', self.left, self.right | 
 | 237 |         if self.left_only: | 
 | 238 |             self.left_only.sort() | 
 | 239 |             print 'Only in', self.left, ':', self.left_only | 
 | 240 |         if self.right_only: | 
 | 241 |             self.right_only.sort() | 
 | 242 |             print 'Only in', self.right, ':', self.right_only | 
 | 243 |         if self.same_files: | 
 | 244 |             self.same_files.sort() | 
 | 245 |             print 'Identical files :', self.same_files | 
 | 246 |         if self.diff_files: | 
 | 247 |             self.diff_files.sort() | 
 | 248 |             print 'Differing files :', self.diff_files | 
 | 249 |         if self.funny_files: | 
 | 250 |             self.funny_files.sort() | 
 | 251 |             print 'Trouble with common files :', self.funny_files | 
 | 252 |         if self.common_dirs: | 
 | 253 |             self.common_dirs.sort() | 
 | 254 |             print 'Common subdirectories :', self.common_dirs | 
 | 255 |         if self.common_funny: | 
 | 256 |             self.common_funny.sort() | 
 | 257 |             print 'Common funny cases :', self.common_funny | 
 | 258 |  | 
 | 259 |     def report_partial_closure(self): # Print reports on self and on subdirs | 
 | 260 |         self.report() | 
| Raymond Hettinger | e0d4972 | 2002-06-02 18:55:56 +0000 | [diff] [blame] | 261 |         for sd in self.subdirs.itervalues(): | 
| Guido van Rossum | 63b08ac | 2000-06-29 14:13:28 +0000 | [diff] [blame] | 262 |             print | 
| Raymond Hettinger | e0d4972 | 2002-06-02 18:55:56 +0000 | [diff] [blame] | 263 |             sd.report() | 
| Guido van Rossum | 63b08ac | 2000-06-29 14:13:28 +0000 | [diff] [blame] | 264 |  | 
 | 265 |     def report_full_closure(self): # Report on self and subdirs recursively | 
 | 266 |         self.report() | 
| Raymond Hettinger | e0d4972 | 2002-06-02 18:55:56 +0000 | [diff] [blame] | 267 |         for sd in self.subdirs.itervalues(): | 
| Guido van Rossum | 63b08ac | 2000-06-29 14:13:28 +0000 | [diff] [blame] | 268 |             print | 
| Raymond Hettinger | e0d4972 | 2002-06-02 18:55:56 +0000 | [diff] [blame] | 269 |             sd.report_full_closure() | 
| Guido van Rossum | 63b08ac | 2000-06-29 14:13:28 +0000 | [diff] [blame] | 270 |  | 
 | 271 |  | 
def cmpfiles(a, b, common, shallow=1, use_statcache=0):
    """Compare files of the same name in two directories.

    a, b -- directory names
    common -- list of file names to compare; each is joined onto both
              directory names
    shallow -- passed on to cmp(): if true, equal stat() signatures
               are enough to call two files identical
    use_statcache -- passed on to cmp(): if true, use statcache.stat()
                     instead of os.stat()

    Returns a tuple of three lists of names from 'common':
      names whose files compare equal
      names whose files compare different
      names for which the comparison raised os.error

    """
    same, different, trouble = [], [], []
    buckets = (same, different, trouble)
    for name in common:
        left_path = os.path.join(a, name)
        right_path = os.path.join(b, name)
        # _cmp() yields 0, 1 or 2, which selects the matching list.
        verdict = _cmp(left_path, right_path, shallow, use_statcache)
        buckets[verdict].append(name)
    return buckets
 | 292 |  | 
 | 293 |  | 
 | 294 | # Compare two files. | 
 | 295 | # Return: | 
| Tim Peters | 88869f9 | 2001-01-14 23:36:06 +0000 | [diff] [blame] | 296 | #       0 for equal | 
 | 297 | #       1 for different | 
 | 298 | #       2 for funny cases (can't stat, etc.) | 
| Guido van Rossum | 63b08ac | 2000-06-29 14:13:28 +0000 | [diff] [blame] | 299 | # | 
def _cmp(a, b, sh, st):
    """Classify a file pair for cmpfiles().

    a, b -- file names; sh, st -- the 'shallow' and 'use_statcache'
    flags handed to cmp().

    Returns a false value (0) when cmp() reports the files equal, a
    true value (1) when it reports them different, and 2 when cmp()
    raises os.error.
    """
    try:
        same = cmp(a, b, sh, st)
    except os.error:
        return 2
    return not abs(same)
 | 305 |  | 
 | 306 |  | 
 | 307 | # Return a copy with items that occur in skip removed. | 
 | 308 | # | 
 | 309 | def _filter(list, skip): | 
 | 310 |     result = [] | 
 | 311 |     for item in list: | 
 | 312 |         if item not in skip: result.append(item) | 
 | 313 |     return result | 
 | 314 |  | 
 | 315 |  | 
 | 316 | # Demonstration and testing. | 
 | 317 | # | 
 | 318 | def demo(): | 
 | 319 |     import sys | 
 | 320 |     import getopt | 
 | 321 |     options, args = getopt.getopt(sys.argv[1:], 'r') | 
| Fred Drake | 8152d32 | 2000-12-12 23:20:45 +0000 | [diff] [blame] | 322 |     if len(args) != 2: | 
 | 323 |         raise getopt.error, 'need exactly two args' | 
| Guido van Rossum | 63b08ac | 2000-06-29 14:13:28 +0000 | [diff] [blame] | 324 |     dd = dircmp(args[0], args[1]) | 
 | 325 |     if ('-r', '') in options: | 
 | 326 |         dd.report_full_closure() | 
 | 327 |     else: | 
 | 328 |         dd.report() | 
 | 329 |  | 
# Run the demonstration when this file is executed as a script.
if __name__ == '__main__':
    demo()