blob: 0bc75cfca4d9c9dce9a47759dddb8ec99aa6383a [file] [log] [blame]
epoger@google.com9dddf6f2013-11-08 16:25:25 +00001#!/usr/bin/python
2
3"""
4Copyright 2013 Google Inc.
5
6Use of this source code is governed by a BSD-style license that can be
7found in the LICENSE file.
8
9Calculate differences between image pairs, and store them in a database.
10"""
11
epoger66ed8dc2014-07-17 12:54:16 -070012# System-level imports
epoger@google.com9dddf6f2013-11-08 16:25:25 +000013import contextlib
epoger0b712762014-08-05 10:07:22 -070014import errno
epoger54f1ad82014-07-02 07:43:04 -070015import json
epoger@google.com9dddf6f2013-11-08 16:25:25 +000016import logging
17import os
epoger0b712762014-08-05 10:07:22 -070018import Queue
commit-bot@chromium.org9985ef52014-02-10 18:19:30 +000019import re
epoger@google.com9dddf6f2013-11-08 16:25:25 +000020import shutil
commit-bot@chromium.org44546f82014-02-11 18:21:26 +000021import tempfile
epoger0b712762014-08-05 10:07:22 -070022import threading
23import time
epoger@google.com9dddf6f2013-11-08 16:25:25 +000024import urllib
epoger@google.com9dddf6f2013-11-08 16:25:25 +000025
epoger66ed8dc2014-07-17 12:54:16 -070026# Must fix up PYTHONPATH before importing from within Skia
stephana3b5c86c2014-08-18 13:37:59 -070027import rs_fixpypath # pylint: disable=W0611
epoger66ed8dc2014-07-17 12:54:16 -070028
29# Imports from within Skia
commit-bot@chromium.org44546f82014-02-11 18:21:26 +000030import find_run_binary
epoger0b712762014-08-05 10:07:22 -070031from py.utils import gs_utils
32
commit-bot@chromium.org44546f82014-02-11 18:21:26 +000033
# Program passed as argv[0] when shelling out to skpdiff (see DiffRecord).
SKPDIFF_BINARY = find_run_binary.find_path_to_program('skpdiff')

DEFAULT_IMAGE_SUFFIX = '.png'
DEFAULT_IMAGES_SUBDIR = 'images'
# TODO(epoger): Figure out a better default number of threads; for now,
# using a conservative default value.
DEFAULT_NUM_WORKER_THREADS = 1

# Matches every character that is NOT a word character or a hyphen; such
# characters are replaced when sanitizing locators for use in file paths.
# Written as a raw string so the backslashes reach the regex engine verbatim
# instead of relying on Python passing unknown string escapes through.
DISALLOWED_FILEPATH_CHAR_REGEX = re.compile(r'[^\w\-]')

# Subdirectories (within the storage root) that receive skpdiff's diff images.
RGBDIFFS_SUBDIR = 'diffs'
WHITEDIFFS_SUBDIR = 'whitediffs'

# Keys used within DiffRecord dictionary representations.
# NOTE: Keep these in sync with static/constants.js
KEY__DIFFERENCES__MAX_DIFF_PER_CHANNEL = 'maxDiffPerChannel'
KEY__DIFFERENCES__NUM_DIFF_PIXELS = 'numDifferingPixels'
KEY__DIFFERENCES__PERCENT_DIFF_PIXELS = 'percentDifferingPixels'
KEY__DIFFERENCES__PERCEPTUAL_DIFF = 'perceptualDifference'
KEY__DIFFERENCES__DIFF_URL = 'diffUrl'
KEY__DIFFERENCES__WHITE_DIFF_URL = 'whiteDiffUrl'

# Special values within ImageDiffDB._diff_dict
_DIFFRECORD_FAILED = 'failed'
_DIFFRECORD_PENDING = 'pending'

# How often to report tasks_queue size
QUEUE_LOGGING_GRANULARITY = 1000

# Temporary variable to keep track of how many times we download
# the same file in multiple threads.
# TODO(epoger): Delete this, once we see that the number stays close to 0.
global_file_collisions = 0
67
epoger@google.com9dddf6f2013-11-08 16:25:25 +000068
class DiffRecord(object):
  """ Record of differences between two images. """

  def __init__(self, gs, storage_root,
               expected_image_url, expected_image_locator,
               actual_image_url, actual_image_locator,
               expected_images_subdir=DEFAULT_IMAGES_SUBDIR,
               actual_images_subdir=DEFAULT_IMAGES_SUBDIR,
               image_suffix=DEFAULT_IMAGE_SUFFIX):
    """Download this pair of images (unless we already have them on local disk),
    and prepare a DiffRecord for them.

    If either URL is empty, or both URLs are identical, the images are
    downloaded but no diff is generated; in that case none of the diff
    attributes are set, so the getters and as_dict() will raise
    AttributeError on such a record.

    Args:
      gs: instance of GSUtils object we can use to download images
      storage_root: root directory on local disk within which we store all
          images
      expected_image_url: file, GS, or HTTP url from which we will download the
          expected image
      expected_image_locator: a unique ID string under which we will store the
          expected image within storage_root (probably including a checksum to
          guarantee uniqueness)
      actual_image_url: file, GS, or HTTP url from which we will download the
          actual image
      actual_image_locator: a unique ID string under which we will store the
          actual image within storage_root (probably including a checksum to
          guarantee uniqueness)
      expected_images_subdir: the subdirectory expected images are stored in.
      actual_images_subdir: the subdirectory actual images are stored in.
      image_suffix: the suffix of images.

    Raises:
      Exception: any exception raised while downloading an image is logged
          and then re-raised unchanged; failures while running skpdiff or
          parsing its output propagate as well.
    """
    expected_image_locator = _sanitize_locator(expected_image_locator)
    actual_image_locator = _sanitize_locator(actual_image_locator)

    # Download the expected/actual images, if we don't have them already.
    expected_image_file = os.path.join(
        storage_root, expected_images_subdir,
        str(expected_image_locator) + image_suffix)
    actual_image_file = os.path.join(
        storage_root, actual_images_subdir,
        str(actual_image_locator) + image_suffix)
    for image_file, image_url in [
        (expected_image_file, expected_image_url),
        (actual_image_file, actual_image_url)]:
      if image_file and image_url:
        try:
          _download_file(gs, image_file, image_url)
        except Exception:
          logging.exception('unable to download image_url %s to file %s' %
                            (image_url, image_file))
          raise

    # Return early if we do not need to generate diffs.
    if (expected_image_url == actual_image_url or
        not expected_image_url or not actual_image_url):
      return

    # Get all diff images and values using the skpdiff binary.
    skpdiff_output_dir = tempfile.mkdtemp()
    try:
      skpdiff_summary_file = os.path.join(skpdiff_output_dir,
                                          'skpdiff-output.json')
      skpdiff_rgbdiff_dir = os.path.join(storage_root, RGBDIFFS_SUBDIR)
      skpdiff_whitediff_dir = os.path.join(storage_root, WHITEDIFFS_SUBDIR)
      # Both output directories must exist before skpdiff writes into them.
      _mkdir_unless_exists(skpdiff_rgbdiff_dir)
      _mkdir_unless_exists(skpdiff_whitediff_dir)

      # TODO(epoger): Consider calling skpdiff ONCE for all image pairs,
      # instead of calling it separately for each image pair.
      # Pro: we'll incur less overhead from making repeated system calls,
      # spinning up the skpdiff binary, etc.
      # Con: we would have to wait until all image pairs were loaded before
      # generating any of the diffs?
      # Note(stephana): '--longnames' was added to allow for this
      # case (multiple files at once) versus specifying output diffs
      # directly.
      find_run_binary.run_command(
          [SKPDIFF_BINARY, '-p', expected_image_file, actual_image_file,
           '--jsonp', 'false',
           '--longnames', 'true',
           '--output', skpdiff_summary_file,
           '--differs', 'perceptual', 'different_pixels',
           '--rgbDiffDir', skpdiff_rgbdiff_dir,
           '--whiteDiffDir', skpdiff_whitediff_dir,
          ])

      # Get information out of the skpdiff_summary_file.
      with open(skpdiff_summary_file) as fp:
        data = json.load(fp)

      # For now, we can assume there is only one record in the output summary,
      # since we passed skpdiff only one pair of images.
      record = data['records'][0]
      self._width = record['width']
      self._height = record['height']
      # Only the file names are kept; the directory part is dropped.
      self._diffUrl = os.path.split(record['rgbDiffPath'])[1]
      self._whiteDiffUrl = os.path.split(record['whiteDiffPath'])[1]

      # TODO: make max_diff_per_channel a tuple instead of a list, because the
      # structure is meaningful (first element is red, second is green, etc.)
      # See http://stackoverflow.com/a/626871
      self._max_diff_per_channel = [
          record['maxRedDiff'], record['maxGreenDiff'], record['maxBlueDiff']]
      per_differ_stats = record['diffs']
      for stats in per_differ_stats:
        differ_name = stats['differName']
        if differ_name == 'different_pixels':
          self._num_pixels_differing = stats['pointsOfInterest']
        elif differ_name == 'perceptual':
          perceptual_similarity = stats['result']

      # skpdiff returns the perceptual similarity; convert it to get the
      # perceptual difference percentage.
      # skpdiff outputs -1 if the images are different sizes. Treat any
      # output that does not lie in [0, 1] as having 0% perceptual
      # similarity.
      if not 0 <= perceptual_similarity <= 1:
        perceptual_similarity = 0
      self._perceptual_difference = 100 - (perceptual_similarity * 100)
    finally:
      # The summary JSON is only needed while parsing; always clean it up.
      shutil.rmtree(skpdiff_output_dir)

  # TODO(epoger): Use properties instead of getters throughout.
  # See http://stackoverflow.com/a/6618176
  def get_num_pixels_differing(self):
    """Returns the absolute number of pixels that differ."""
    return self._num_pixels_differing

  def get_percent_pixels_differing(self):
    """Returns the percentage of pixels that differ, as a float between
    0 and 100 (inclusive)."""
    return ((float(self._num_pixels_differing) * 100) /
            (self._width * self._height))

  def get_perceptual_difference(self):
    """Returns the perceptual difference percentage."""
    return self._perceptual_difference

  def get_max_diff_per_channel(self):
    """Returns the maximum difference between the expected and actual images
    for each R/G/B channel, as a list."""
    return self._max_diff_per_channel

  def as_dict(self):
    """Returns a dictionary representation of this DiffRecord, as needed when
    constructing the JSON representation."""
    return {
        KEY__DIFFERENCES__NUM_DIFF_PIXELS: self._num_pixels_differing,
        KEY__DIFFERENCES__PERCENT_DIFF_PIXELS:
            self.get_percent_pixels_differing(),
        KEY__DIFFERENCES__MAX_DIFF_PER_CHANNEL: self._max_diff_per_channel,
        KEY__DIFFERENCES__PERCEPTUAL_DIFF: self._perceptual_difference,
        KEY__DIFFERENCES__DIFF_URL: self._diffUrl,
        KEY__DIFFERENCES__WHITE_DIFF_URL: self._whiteDiffUrl,
    }
223
epoger@google.com9dddf6f2013-11-08 16:25:25 +0000224
stephanaa1aa5c22014-08-15 06:53:23 -0700225
class ImageDiffDB(object):
  """ Computes diffs for pairs of images on background worker threads, and
  keeps the resulting DiffRecords in an in-memory table for later lookup."""

  def __init__(self, storage_root, gs=None,
               num_worker_threads=DEFAULT_NUM_WORKER_THREADS):
    """
    Args:
      storage_root: string; root path within which the DB will store all of
          its stuff
      gs: instance of GSUtils object we can use to download images
      num_worker_threads: how many threads download images and generate
          diffs simultaneously
    """
    self._storage_root = storage_root
    self._gs = gs

    # State used to report the queue size periodically.
    self._last_queue_size_reported = None
    self._queue_size_report_lock = threading.RLock()

    # Table of results, keyed by (expected_image_locator,
    # actual_image_locator) tuples. Each value is a DiffRecord, or one of
    # the markers _DIFFRECORD_PENDING / _DIFFRECORD_FAILED.
    #
    # Any thread that modifies _diff_dict must first acquire
    # _diff_dict_writelock!
    #
    # TODO(epoger): Disk is limitless, but RAM is not... so, we should probably
    # remove items from self._diff_dict if they haven't been accessed for a
    # long time.  We can always regenerate them by diffing the images we
    # previously downloaded to local disk.
    # I guess we should figure out how expensive it is to download vs diff the
    # image pairs... if diffing them is expensive too, we can write these
    # _diff_dict objects out to disk if there's too many to hold in RAM.
    # Or we could use virtual memory to handle that automatically.
    self._diff_dict = {}
    self._diff_dict_writelock = threading.RLock()

    # Queue of pending image pairs, consumed by the daemon worker threads
    # started below.
    # maxsize must be 0 (unbounded) so that add_image_pair() never blocks
    # while enqueueing.
    self._tasks_queue = Queue.Queue(maxsize=0)
    self._workers = []
    for worker_index in range(num_worker_threads):
      thread = threading.Thread(target=self.worker, args=(worker_index,))
      thread.daemon = True
      thread.start()
      self._workers.append(thread)

  def log_queue_size_if_changed(self, limit_verbosity=True):
    """Log the size of self._tasks_queue, if it has changed since the last call.

    Nothing is logged when the size equals the most recently reported size.

    Args:
      limit_verbosity: if True, only log if the queue size is a multiple of
          QUEUE_LOGGING_GRANULARITY
    """
    # The lock serializes access to self._last_queue_size_reported.
    with self._queue_size_report_lock:
      current_size = self._tasks_queue.qsize()
      unchanged = (current_size == self._last_queue_size_reported)
      suppressed = (limit_verbosity and
                    (current_size % QUEUE_LOGGING_GRANULARITY != 0))
      if unchanged or suppressed:
        return
      logging.info('tasks_queue size is %d' % current_size)
      self._last_queue_size_reported = current_size

  def worker(self, worker_num):
    """Body of a worker thread: loops forever, pulling tasks off
    self._tasks_queue and recording the outcome of each in self._diff_dict.

    Args:
      worker_num: (integer) which worker this is
    """
    while True:
      self.log_queue_size_if_changed()
      key, expected_image_url, actual_image_url = self._tasks_queue.get()
      try:
        outcome = DiffRecord(
            self._gs, self._storage_root,
            expected_image_url=expected_image_url,
            expected_image_locator=key[0],
            actual_image_url=actual_image_url,
            actual_image_locator=key[1])
      except Exception:
        # Record the failure rather than letting the worker thread die.
        logging.exception(
            'exception while creating DiffRecord for key %s' % str(key))
        outcome = _DIFFRECORD_FAILED
      with self._diff_dict_writelock:
        self._diff_dict[key] = outcome

  @property
  def storage_root(self):
    return self._storage_root

  def add_image_pair(self,
                     expected_image_url, expected_image_locator,
                     actual_image_url, actual_image_locator):
    """Asynchronously prepare a DiffRecord for a pair of images.

    This method will return quickly; calls to get_diff_record() will block
    until the DiffRecord is available (or we have given up on creating it).

    If this particular image pair has already been requested, no work
    will be done.

    If expected_image_url (or its locator) is None, just download actual_image.
    If actual_image_url (or its locator) is None, just download expected_image.

    Args:
      expected_image_url: file, GS, or HTTP url from which we will download the
          expected image
      expected_image_locator: a unique ID string under which we will store the
          expected image within storage_root (probably including a checksum to
          guarantee uniqueness)
      actual_image_url: file, GS, or HTTP url from which we will download the
          actual image
      actual_image_locator: a unique ID string under which we will store the
          actual image within storage_root (probably including a checksum to
          guarantee uniqueness)
    """
    key = (_sanitize_locator(expected_image_locator),
           _sanitize_locator(actual_image_locator))

    # Claim the key under the lock, so that a given pair is enqueued at most
    # once even when several threads request it.
    with self._diff_dict_writelock:
      is_new_request = key not in self._diff_dict
      if is_new_request:
        self._diff_dict[key] = _DIFFRECORD_PENDING

    if not is_new_request:
      return
    self._tasks_queue.put((key, expected_image_url, actual_image_url))
    self.log_queue_size_if_changed()

  def get_diff_record(self, expected_image_locator, actual_image_locator):
    """Returns the DiffRecord for this image pair.

    This call will block until the diff record is available, or we were unable
    to generate it.

    Args:
      expected_image_locator: a unique ID string under which we will store the
          expected image within storage_root (probably including a checksum to
          guarantee uniqueness)
      actual_image_locator: a unique ID string under which we will store the
          actual image within storage_root (probably including a checksum to
          guarantee uniqueness)

    Returns the DiffRecord for this image pair, or None if we were unable to
    generate one.
    """
    key = (_sanitize_locator(expected_image_locator),
           _sanitize_locator(actual_image_locator))

    # Poll once per second until a worker has replaced the pending marker.
    while True:
      diff_record = self._diff_dict[key]
      if diff_record == _DIFFRECORD_PENDING:
        time.sleep(1)
        continue
      break

    if diff_record == _DIFFRECORD_FAILED:
      logging.error(
          'failed to create a DiffRecord for expected_image_locator=%s , '
          'actual_image_locator=%s' % (
              expected_image_locator, actual_image_locator))
      return None
    return diff_record
epoger@google.com9dddf6f2013-11-08 16:25:25 +0000410
411
412# Utility functions
413
epoger0b712762014-08-05 10:07:22 -0700414def _download_file(gs, local_filepath, url):
epoger54f1ad82014-07-02 07:43:04 -0700415 """Download a file from url to local_filepath, unless it is already there.
epoger@google.com9dddf6f2013-11-08 16:25:25 +0000416
417 Args:
epoger0b712762014-08-05 10:07:22 -0700418 gs: instance of GSUtils object, in case the url points at Google Storage
epoger@google.com9dddf6f2013-11-08 16:25:25 +0000419 local_filepath: path on local disk where the image should be stored
epoger0b712762014-08-05 10:07:22 -0700420 url: HTTP or GS URL from which we can download the image if we don't have
421 it yet
epoger@google.com9dddf6f2013-11-08 16:25:25 +0000422 """
epoger0b712762014-08-05 10:07:22 -0700423 global global_file_collisions
epoger@google.com9dddf6f2013-11-08 16:25:25 +0000424 if not os.path.exists(local_filepath):
425 _mkdir_unless_exists(os.path.dirname(local_filepath))
epoger0b712762014-08-05 10:07:22 -0700426
427 # First download the file contents into a unique filename, and
428 # then rename that file. That way, if multiple threads are downloading
429 # the same filename at the same time, they won't interfere with each
430 # other (they will both download the file, and one will "win" in the end)
431 temp_filename = '%s-%d' % (local_filepath,
432 threading.current_thread().ident)
433 if gs_utils.GSUtils.is_gs_url(url):
434 (bucket, path) = gs_utils.GSUtils.split_gs_url(url)
435 gs.download_file(source_bucket=bucket, source_path=path,
436 dest_path=temp_filename)
437 else:
438 with contextlib.closing(urllib.urlopen(url)) as url_handle:
439 with open(temp_filename, 'wb') as file_handle:
440 shutil.copyfileobj(fsrc=url_handle, fdst=file_handle)
441
442 # Rename the file to its real filename.
443 # Keep count of how many colliding downloads we encounter;
444 # if it's a large number, we may want to change our download strategy
445 # to minimize repeated downloads.
446 if os.path.exists(local_filepath):
447 global_file_collisions += 1
448 else:
449 os.rename(temp_filename, local_filepath)
epoger@google.com214a0242013-11-22 19:26:18 +0000450
commit-bot@chromium.org16f41802014-02-26 19:05:20 +0000451
epoger@google.com9dddf6f2013-11-08 16:25:25 +0000452def _mkdir_unless_exists(path):
453 """Unless path refers to an already-existing directory, create it.
454
455 Args:
456 path: path on local disk
457 """
epoger0b712762014-08-05 10:07:22 -0700458 try:
epoger@google.com9dddf6f2013-11-08 16:25:25 +0000459 os.makedirs(path)
epoger0b712762014-08-05 10:07:22 -0700460 except OSError as e:
461 if e.errno == errno.EEXIST:
462 pass
epoger@google.com9dddf6f2013-11-08 16:25:25 +0000463
commit-bot@chromium.org16f41802014-02-26 19:05:20 +0000464
commit-bot@chromium.org9985ef52014-02-10 18:19:30 +0000465def _sanitize_locator(locator):
466 """Returns a sanitized version of a locator (one in which we know none of the
467 characters will have special meaning in filenames).
468
469 Args:
epoger3facc7c2014-08-06 10:56:50 -0700470 locator: string, or something that can be represented as a string.
471 If None or '', it is returned without modification, because empty
472 locators have a particular meaning ("there is no image for this")
commit-bot@chromium.org9985ef52014-02-10 18:19:30 +0000473 """
epoger3facc7c2014-08-06 10:56:50 -0700474 if locator:
475 return DISALLOWED_FILEPATH_CHAR_REGEX.sub('_', str(locator))
476 else:
477 return locator