epoger@google.com | 9dddf6f | 2013-11-08 16:25:25 +0000 | [diff] [blame] | 1 | #!/usr/bin/python |
| 2 | |
| 3 | """ |
| 4 | Copyright 2013 Google Inc. |
| 5 | |
| 6 | Use of this source code is governed by a BSD-style license that can be |
| 7 | found in the LICENSE file. |
| 8 | |
Calculate differences between image pairs, and store them in a database.
| 10 | """ |
| 11 | |
epoger | 66ed8dc | 2014-07-17 12:54:16 -0700 | [diff] [blame] | 12 | # System-level imports |
epoger@google.com | 9dddf6f | 2013-11-08 16:25:25 +0000 | [diff] [blame] | 13 | import contextlib |
epoger | 0b71276 | 2014-08-05 10:07:22 -0700 | [diff] [blame] | 14 | import errno |
epoger | 54f1ad8 | 2014-07-02 07:43:04 -0700 | [diff] [blame] | 15 | import json |
epoger@google.com | 9dddf6f | 2013-11-08 16:25:25 +0000 | [diff] [blame] | 16 | import logging |
| 17 | import os |
epoger | 0b71276 | 2014-08-05 10:07:22 -0700 | [diff] [blame] | 18 | import Queue |
commit-bot@chromium.org | 9985ef5 | 2014-02-10 18:19:30 +0000 | [diff] [blame] | 19 | import re |
epoger@google.com | 9dddf6f | 2013-11-08 16:25:25 +0000 | [diff] [blame] | 20 | import shutil |
commit-bot@chromium.org | 44546f8 | 2014-02-11 18:21:26 +0000 | [diff] [blame] | 21 | import tempfile |
epoger | 0b71276 | 2014-08-05 10:07:22 -0700 | [diff] [blame] | 22 | import threading |
| 23 | import time |
epoger@google.com | 9dddf6f | 2013-11-08 16:25:25 +0000 | [diff] [blame] | 24 | import urllib |
epoger@google.com | 9dddf6f | 2013-11-08 16:25:25 +0000 | [diff] [blame] | 25 | |
epoger | 66ed8dc | 2014-07-17 12:54:16 -0700 | [diff] [blame] | 26 | # Must fix up PYTHONPATH before importing from within Skia |
stephana | 3b5c86c | 2014-08-18 13:37:59 -0700 | [diff] [blame] | 27 | import rs_fixpypath # pylint: disable=W0611 |
epoger | 66ed8dc | 2014-07-17 12:54:16 -0700 | [diff] [blame] | 28 | |
| 29 | # Imports from within Skia |
commit-bot@chromium.org | 44546f8 | 2014-02-11 18:21:26 +0000 | [diff] [blame] | 30 | import find_run_binary |
epoger | 0b71276 | 2014-08-05 10:07:22 -0700 | [diff] [blame] | 31 | from py.utils import gs_utils |
| 32 | |
commit-bot@chromium.org | 44546f8 | 2014-02-11 18:21:26 +0000 | [diff] [blame] | 33 | |
# Location of the 'skpdiff' program, looked up once when this module is
# imported. DiffRecord passes it as the first element of the command line it
# runs to compare a pair of images.
SKPDIFF_BINARY = find_run_binary.find_path_to_program('skpdiff')
commit-bot@chromium.org | 44546f8 | 2014-02-11 18:21:26 +0000 | [diff] [blame] | 35 | |
# Defaults for where images are stored under storage_root, and for the
# filename suffix appended to each image locator.
DEFAULT_IMAGE_SUFFIX = '.png'
DEFAULT_IMAGES_SUBDIR = 'images'
# TODO(epoger): Figure out a better default number of threads; for now,
# using a conservative default value.
DEFAULT_NUM_WORKER_THREADS = 1

# Matches every character that is NOT a word character or '-'.
# _sanitize_locator() replaces each match with '_'.
# This must be a raw string: '\w' and '\-' are not valid string escapes.
DISALLOWED_FILEPATH_CHAR_REGEX = re.compile(r'[^\w\-]')

# Subdirectories of storage_root that receive the diff images from skpdiff.
RGBDIFFS_SUBDIR = 'diffs'
WHITEDIFFS_SUBDIR = 'whitediffs'

# Keys used within DiffRecord dictionary representations.
# NOTE: Keep these in sync with static/constants.js
KEY__DIFFERENCES__MAX_DIFF_PER_CHANNEL = 'maxDiffPerChannel'
KEY__DIFFERENCES__NUM_DIFF_PIXELS = 'numDifferingPixels'
KEY__DIFFERENCES__PERCENT_DIFF_PIXELS = 'percentDifferingPixels'
KEY__DIFFERENCES__PERCEPTUAL_DIFF = 'perceptualDifference'
KEY__DIFFERENCES__DIFF_URL = 'diffUrl'
KEY__DIFFERENCES__WHITE_DIFF_URL = 'whiteDiffUrl'

# Special values within ImageDiffDB._diff_dict
_DIFFRECORD_FAILED = 'failed'
_DIFFRECORD_PENDING = 'pending'

# How often to report tasks_queue size
QUEUE_LOGGING_GRANULARITY = 1000

# Temporary variable to keep track of how many times we download
# the same file in multiple threads.
# TODO(epoger): Delete this, once we see that the number stays close to 0.
global_file_collisions = 0
| 67 | |
epoger@google.com | 9dddf6f | 2013-11-08 16:25:25 +0000 | [diff] [blame] | 68 | |
class DiffRecord(object):
  """ Record of differences between two images.

  NOTE: when the constructor returns early (the two URLs are equal, or
  either one is empty), none of the diff attributes are set, so the getters
  and as_dict() will raise AttributeError on such an instance.
  """

  def __init__(self, gs, storage_root,
               expected_image_url, expected_image_locator,
               actual_image_url, actual_image_locator,
               expected_images_subdir=DEFAULT_IMAGES_SUBDIR,
               actual_images_subdir=DEFAULT_IMAGES_SUBDIR,
               image_suffix=DEFAULT_IMAGE_SUFFIX):
    """Download this pair of images (unless we already have them on local disk),
    and prepare a DiffRecord for them.

    Args:
      gs: instance of GSUtils object we can use to download images
      storage_root: root directory on local disk within which we store all
          images
      expected_image_url: file, GS, or HTTP url from which we will download the
          expected image
      expected_image_locator: a unique ID string under which we will store the
          expected image within storage_root (probably including a checksum to
          guarantee uniqueness)
      actual_image_url: file, GS, or HTTP url from which we will download the
          actual image
      actual_image_locator: a unique ID string under which we will store the
          actual image within storage_root (probably including a checksum to
          guarantee uniqueness)
      expected_images_subdir: the subdirectory expected images are stored in.
      actual_images_subdir: the subdirectory actual images are stored in.
      image_suffix: the suffix of images.

    Raises:
      Exception: whatever _download_file() raised, after logging it; errors
          from running skpdiff or parsing its output also propagate.
    """
    expected_image_locator = _sanitize_locator(expected_image_locator)
    actual_image_locator = _sanitize_locator(actual_image_locator)

    # Download the expected/actual images, if we don't have them already.
    expected_image_file = os.path.join(
        storage_root, expected_images_subdir,
        str(expected_image_locator) + image_suffix)
    actual_image_file = os.path.join(
        storage_root, actual_images_subdir,
        str(actual_image_locator) + image_suffix)
    for image_file, image_url in [
        (expected_image_file, expected_image_url),
        (actual_image_file, actual_image_url)]:
      if image_file and image_url:
        try:
          _download_file(gs, image_file, image_url)
        except Exception:
          logging.exception('unable to download image_url %s to file %s' %
                            (image_url, image_file))
          raise

    # Return early if we do not need to generate diffs.
    if (expected_image_url == actual_image_url or
        not expected_image_url or not actual_image_url):
      return

    # Get all diff images and values using the skpdiff binary.
    # The JSON summary goes into a throwaway temp dir; the diff images
    # themselves are written under storage_root so they outlive this call.
    skpdiff_output_dir = tempfile.mkdtemp()
    try:
      skpdiff_summary_file = os.path.join(skpdiff_output_dir,
                                          'skpdiff-output.json')
      skpdiff_rgbdiff_dir = os.path.join(storage_root, RGBDIFFS_SUBDIR)
      skpdiff_whitediff_dir = os.path.join(storage_root, WHITEDIFFS_SUBDIR)
      # Both output directories must exist (previously the rgbdiff dir was
      # created twice and the whitediff dir was never created).
      _mkdir_unless_exists(skpdiff_rgbdiff_dir)
      _mkdir_unless_exists(skpdiff_whitediff_dir)

      # TODO(epoger): Consider calling skpdiff ONCE for all image pairs,
      # instead of calling it separately for each image pair.
      # Pro: we'll incur less overhead from making repeated system calls,
      # spinning up the skpdiff binary, etc.
      # Con: we would have to wait until all image pairs were loaded before
      # generating any of the diffs?
      # Note(stephana): '--longnames' was added to allow for this
      # case (multiple files at once) versus specifying output diffs
      # directly.
      find_run_binary.run_command(
          [SKPDIFF_BINARY, '-p', expected_image_file, actual_image_file,
           '--jsonp', 'false',
           '--longnames', 'true',
           '--output', skpdiff_summary_file,
           '--differs', 'perceptual', 'different_pixels',
           '--rgbDiffDir', skpdiff_rgbdiff_dir,
           '--whiteDiffDir', skpdiff_whitediff_dir,
          ])

      # Get information out of the skpdiff_summary_file.
      with open(skpdiff_summary_file) as fp:
        data = json.load(fp)

      # For now, we can assume there is only one record in the output summary,
      # since we passed skpdiff only one pair of images.
      record = data['records'][0]
      self._width = record['width']
      self._height = record['height']
      # Keep only the base filename of each diff image.
      self._diffUrl = os.path.split(record['rgbDiffPath'])[1]
      self._whiteDiffUrl = os.path.split(record['whiteDiffPath'])[1]

      # TODO: make max_diff_per_channel a tuple instead of a list, because the
      # structure is meaningful (first element is red, second is green, etc.)
      # See http://stackoverflow.com/a/626871
      self._max_diff_per_channel = [
          record['maxRedDiff'], record['maxGreenDiff'], record['maxBlueDiff']]
      per_differ_stats = record['diffs']
      for stats in per_differ_stats:
        differ_name = stats['differName']
        if differ_name == 'different_pixels':
          self._num_pixels_differing = stats['pointsOfInterest']
        elif differ_name == 'perceptual':
          perceptual_similarity = stats['result']

      # skpdiff returns the perceptual similarity; convert it to get the
      # perceptual difference percentage.
      # skpdiff outputs -1 if the images are different sizes. Treat any
      # output that does not lie in [0, 1] as having 0% perceptual
      # similarity.
      if not 0 <= perceptual_similarity <= 1:
        perceptual_similarity = 0
      self._perceptual_difference = 100 - (perceptual_similarity * 100)
    finally:
      shutil.rmtree(skpdiff_output_dir)

  # TODO(epoger): Use properties instead of getters throughout.
  # See http://stackoverflow.com/a/6618176
  def get_num_pixels_differing(self):
    """Returns the absolute number of pixels that differ."""
    return self._num_pixels_differing

  def get_percent_pixels_differing(self):
    """Returns the percentage of pixels that differ, as a float between
    0 and 100 (inclusive)."""
    return ((float(self._num_pixels_differing) * 100) /
            (self._width * self._height))

  def get_perceptual_difference(self):
    """Returns the perceptual difference percentage."""
    return self._perceptual_difference

  def get_max_diff_per_channel(self):
    """Returns the maximum difference between the expected and actual images
    for each R/G/B channel, as a list."""
    return self._max_diff_per_channel

  def as_dict(self):
    """Returns a dictionary representation of this DiffRecord, as needed when
    constructing the JSON representation."""
    return {
        KEY__DIFFERENCES__NUM_DIFF_PIXELS: self._num_pixels_differing,
        KEY__DIFFERENCES__PERCENT_DIFF_PIXELS:
            self.get_percent_pixels_differing(),
        KEY__DIFFERENCES__MAX_DIFF_PER_CHANNEL: self._max_diff_per_channel,
        KEY__DIFFERENCES__PERCEPTUAL_DIFF: self._perceptual_difference,
        KEY__DIFFERENCES__DIFF_URL: self._diffUrl,
        KEY__DIFFERENCES__WHITE_DIFF_URL: self._whiteDiffUrl,
    }
| 223 | |
epoger@google.com | 9dddf6f | 2013-11-08 16:25:25 +0000 | [diff] [blame] | 224 | |
stephana | a1aa5c2 | 2014-08-15 06:53:23 -0700 | [diff] [blame] | 225 | |
class ImageDiffDB(object):
  """In-memory database of DiffRecords, filled in by background threads.

  Callers request a diff with add_image_pair() (which returns right away)
  and later collect the outcome with get_diff_record() (which waits).
  """

  def __init__(self, storage_root, gs=None,
               num_worker_threads=DEFAULT_NUM_WORKER_THREADS):
    """
    Args:
      storage_root: string; root path within which the DB will store all of
          its stuff
      gs: instance of GSUtils object we can use to download images
      num_worker_threads: how many threads download images and generate
          diffs simultaneously
    """
    self._storage_root = storage_root
    self._gs = gs

    # State behind log_queue_size_if_changed(): the last size we logged,
    # and the lock guarding it.
    self._last_queue_size_reported = None
    self._queue_size_report_lock = threading.RLock()

    # Maps (expected_image_locator, actual_image_locator) tuples to a
    # DiffRecord, or to _DIFFRECORD_PENDING / _DIFFRECORD_FAILED.
    #
    # Any thread that modifies _diff_dict must first acquire
    # _diff_dict_writelock!
    #
    # TODO(epoger): Disk is limitless, but RAM is not... so, we should probably
    # remove items from self._diff_dict if they haven't been accessed for a
    # long time.  We can always regenerate them by diffing the images we
    # previously downloaded to local disk.
    # I guess we should figure out how expensive it is to download vs diff the
    # image pairs... if diffing them is expensive too, we can write these
    # _diff_dict objects out to disk if there's too many to hold in RAM.
    # Or we could use virtual memory to handle that automatically.
    self._diff_dict = {}
    self._diff_dict_writelock = threading.RLock()

    # Work queue consumed by the worker threads.  maxsize=0 means the queue
    # is unbounded, so add_image_pair() never blocks on put().
    self._tasks_queue = Queue.Queue(maxsize=0)
    self._workers = []
    for worker_num in range(num_worker_threads):
      thread = threading.Thread(target=self.worker, args=(worker_num,))
      thread.daemon = True
      thread.start()
      self._workers.append(thread)

  def log_queue_size_if_changed(self, limit_verbosity=True):
    """Log the current size of self._tasks_queue, but only when it is news.

    Nothing is logged if the size equals the size we logged last time.

    Args:
      limit_verbosity: if True, additionally skip any size that is not a
          multiple of QUEUE_LOGGING_GRANULARITY
    """
    # The lock serializes access to self._last_queue_size_reported.
    with self._queue_size_report_lock:
      current_size = self._tasks_queue.qsize()
      unchanged = (current_size == self._last_queue_size_reported)
      off_granularity = (
          limit_verbosity and current_size % QUEUE_LOGGING_GRANULARITY != 0)
      if unchanged or off_granularity:
        return
      logging.info('tasks_queue size is %d' % current_size)
      self._last_queue_size_reported = current_size

  def worker(self, worker_num):
    """Body of a worker thread: process tasks from self._tasks_queue forever.

    Args:
      worker_num: (integer) which worker this is
    """
    while True:
      self.log_queue_size_if_changed()
      key, expected_image_url, actual_image_url = self._tasks_queue.get()
      try:
        result = DiffRecord(
            self._gs, self._storage_root,
            expected_image_url=expected_image_url,
            expected_image_locator=key[0],
            actual_image_url=actual_image_url,
            actual_image_locator=key[1])
      except Exception:
        # Record the failure instead of letting it kill this thread.
        logging.exception(
            'exception while creating DiffRecord for key %s' % str(key))
        result = _DIFFRECORD_FAILED
      with self._diff_dict_writelock:
        self._diff_dict[key] = result

  @property
  def storage_root(self):
    return self._storage_root

  def add_image_pair(self,
                     expected_image_url, expected_image_locator,
                     actual_image_url, actual_image_locator):
    """Queue up the creation of a DiffRecord for a pair of images.

    Returns quickly; use get_diff_record() to wait for the outcome.

    A pair that has already been requested is not queued a second time.

    If expected_image_url (or its locator) is None, just download actual_image.
    If actual_image_url (or its locator) is None, just download expected_image.

    Args:
      expected_image_url: file, GS, or HTTP url from which we will download the
          expected image
      expected_image_locator: a unique ID string under which we will store the
          expected image within storage_root (probably including a checksum to
          guarantee uniqueness)
      actual_image_url: file, GS, or HTTP url from which we will download the
          actual image
      actual_image_locator: a unique ID string under which we will store the
          actual image within storage_root (probably including a checksum to
          guarantee uniqueness)
    """
    key = (_sanitize_locator(expected_image_locator),
           _sanitize_locator(actual_image_locator))

    # Claim the key while holding the lock, so that only one caller ever
    # enqueues a given pair.
    with self._diff_dict_writelock:
      is_new_request = key not in self._diff_dict
      if is_new_request:
        self._diff_dict[key] = _DIFFRECORD_PENDING

    if is_new_request:
      self._tasks_queue.put((key, expected_image_url, actual_image_url))
      self.log_queue_size_if_changed()

  def get_diff_record(self, expected_image_locator, actual_image_locator):
    """Returns the DiffRecord for this image pair.

    Waits (checking once per second) while the entry is still pending.

    Args:
      expected_image_locator: a unique ID string under which we will store the
          expected image within storage_root (probably including a checksum to
          guarantee uniqueness)
      actual_image_locator: a unique ID string under which we will store the
          actual image within storage_root (probably including a checksum to
          guarantee uniqueness)

    Returns the DiffRecord for this image pair, or None if we were unable to
    generate one.

    Raises:
      KeyError: if there is no entry in the database for this pair.
    """
    key = (_sanitize_locator(expected_image_locator),
           _sanitize_locator(actual_image_locator))
    result = self._diff_dict[key]

    # Poll until a worker thread has replaced the pending marker.
    while result == _DIFFRECORD_PENDING:
      time.sleep(1)
      result = self._diff_dict[key]

    if result == _DIFFRECORD_FAILED:
      logging.error(
          'failed to create a DiffRecord for expected_image_locator=%s , '
          'actual_image_locator=%s' % (
              expected_image_locator, actual_image_locator))
      return None
    return result
epoger@google.com | 9dddf6f | 2013-11-08 16:25:25 +0000 | [diff] [blame] | 410 | |
| 411 | |
| 412 | # Utility functions |
| 413 | |
def _download_file(gs, local_filepath, url):
  """Download a file from url to local_filepath, unless it is already there.

  The contents are first written to a per-thread temporary file next to
  local_filepath and then renamed into place. The temporary file is always
  removed if it is still present afterwards (another thread got there first,
  or the download failed).

  Args:
    gs: instance of GSUtils object, in case the url points at Google Storage
    local_filepath: path on local disk where the image should be stored
    url: HTTP or GS URL from which we can download the image if we don't have
        it yet

  Raises:
    Whatever the underlying download or rename raises; nothing is caught here
    except a failure to delete the leftover temporary file, which is logged.
  """
  global global_file_collisions
  if os.path.exists(local_filepath):
    return

  _mkdir_unless_exists(os.path.dirname(local_filepath))

  # First download the file contents into a unique filename, and
  # then rename that file.  That way, if multiple threads are downloading
  # the same filename at the same time, they won't interfere with each
  # other (they will both download the file, and one will "win" in the end)
  temp_filename = '%s-%d' % (local_filepath,
                             threading.current_thread().ident)
  try:
    if gs_utils.GSUtils.is_gs_url(url):
      (bucket, path) = gs_utils.GSUtils.split_gs_url(url)
      gs.download_file(source_bucket=bucket, source_path=path,
                       dest_path=temp_filename)
    else:
      with contextlib.closing(urllib.urlopen(url)) as url_handle:
        with open(temp_filename, 'wb') as file_handle:
          shutil.copyfileobj(fsrc=url_handle, fdst=file_handle)

    # Rename the file to its real filename.
    # Keep count of how many colliding downloads we encounter;
    # if it's a large number, we may want to change our download strategy
    # to minimize repeated downloads.
    if os.path.exists(local_filepath):
      global_file_collisions += 1
    else:
      os.rename(temp_filename, local_filepath)
  finally:
    # After a successful rename the temp file is gone.  It only remains if
    # we lost the race to another thread or the download raised; in both
    # cases it is garbage and must not be left next to the real images.
    if os.path.exists(temp_filename):
      try:
        os.remove(temp_filename)
      except OSError:
        # Best-effort cleanup: do not mask the original exception (if any).
        logging.warning('unable to remove temp file %s' % temp_filename)
epoger@google.com | 214a024 | 2013-11-22 19:26:18 +0000 | [diff] [blame] | 450 | |
commit-bot@chromium.org | 16f4180 | 2014-02-26 19:05:20 +0000 | [diff] [blame] | 451 | |
def _mkdir_unless_exists(path):
  """Unless path refers to an already-existing directory, create it.

  Missing parent directories are created too.

  Args:
    path: path on local disk; an empty path is treated as "nothing to create"

  Raises:
    OSError: if the directory could not be created, or if path already exists
        but is not a directory.
  """
  if not path:
    # os.makedirs('') always fails; there is no directory to create.
    return
  try:
    os.makedirs(path)
  except OSError as e:
    # Another thread/process creating the same directory is fine.  Anything
    # else (permission denied, a regular file in the way, ...) is a real
    # failure that callers need to know about.
    if e.errno != errno.EEXIST or not os.path.isdir(path):
      raise
epoger@google.com | 9dddf6f | 2013-11-08 16:25:25 +0000 | [diff] [blame] | 463 | |
commit-bot@chromium.org | 16f4180 | 2014-02-26 19:05:20 +0000 | [diff] [blame] | 464 | |
def _sanitize_locator(locator):
  """Returns a filename-safe version of a locator.

  Every character matched by DISALLOWED_FILEPATH_CHAR_REGEX is replaced
  with '_'.

  Args:
    locator: string, or something that can be represented as a string.
        A falsy locator (such as None or '') is handed back untouched,
        because empty locators have a particular meaning ("there is no
        image for this")
  """
  if not locator:
    return locator
  return DISALLOWED_FILEPATH_CHAR_REGEX.sub('_', str(locator))