gm/rebaseline_server/imagediffdb.py - platform/external/skqp - Gitiles

 #!/usr/bin/python

 """
 Copyright 2013 Google Inc.

 Use of this source code is governed by a BSD-style license that can be
 found in the LICENSE file.

 Calculate differences between image pairs, and store them in a database.
 """

 # System-level imports
 import contextlib
 import errno
 import json
 import logging
 import os
 import Queue
 import re
 import shutil
 import tempfile
 import threading
 import time
 import urllib

 # Must fix up PYTHONPATH before importing from within Skia
 import rs_fixpypath  # pylint: disable=W0611

 # Imports from within Skia
 import find_run_binary
 from py.utils import gs_utils


 # Location of the 'skpdiff' program, as returned by
 # find_run_binary.find_path_to_program(); looked up once, when this module is
 # imported.
 SKPDIFF_BINARY = find_run_binary.find_path_to_program('skpdiff')

 DEFAULT_IMAGE_SUFFIX = '.png'
 DEFAULT_IMAGES_SUBDIR = 'images'
 # TODO(epoger): Figure out a better default number of threads; for now,
 # using a conservative default value.
 DEFAULT_NUM_WORKER_THREADS = 1

 # Matches every character that is neither a word character (\w) nor '-'.
 # Written as a raw string so the backslashes reach the regex engine exactly
 # as typed, instead of relying on Python leaving unrecognized string escapes
 # untouched.
 DISALLOWED_FILEPATH_CHAR_REGEX = re.compile(r'[^\w\-]')

 # Subdirectories of storage_root that receive the diff images written by
 # skpdiff.
 RGBDIFFS_SUBDIR = 'diffs'
 WHITEDIFFS_SUBDIR = 'whitediffs'

 # Keys used within DiffRecord dictionary representations.
 # NOTE: Keep these in sync with static/constants.js
 KEY__DIFFERENCES__MAX_DIFF_PER_CHANNEL = 'maxDiffPerChannel'
 KEY__DIFFERENCES__NUM_DIFF_PIXELS = 'numDifferingPixels'
 KEY__DIFFERENCES__PERCENT_DIFF_PIXELS = 'percentDifferingPixels'
 KEY__DIFFERENCES__PERCEPTUAL_DIFF = 'perceptualDifference'
 KEY__DIFFERENCES__DIFF_URL = 'diffUrl'
 KEY__DIFFERENCES__WHITE_DIFF_URL = 'whiteDiffUrl'

 # Special values within ImageDiffDB._diff_dict
 _DIFFRECORD_FAILED = 'failed'
 _DIFFRECORD_PENDING = 'pending'

 # How often to report tasks_queue size
 QUEUE_LOGGING_GRANULARITY = 1000

 # Temporary variable to keep track of how many times we download
 # the same file in multiple threads.
 # TODO(epoger): Delete this, once we see that the number stays close to 0.
 global_file_collisions = 0


 class DiffRecord(object):
   """Record of differences between two images, as computed by skpdiff."""

   def __init__(self, gs, storage_root,
                expected_image_url, expected_image_locator,
                actual_image_url, actual_image_locator,
                expected_images_subdir=DEFAULT_IMAGES_SUBDIR,
                actual_images_subdir=DEFAULT_IMAGES_SUBDIR,
                image_suffix=DEFAULT_IMAGE_SUFFIX):
     """Download this pair of images (unless we already have them on local disk),
     and prepare a DiffRecord for them.

     If either URL is empty, or both URLs are the same, no diff is generated:
     the constructor returns right after the download step, and none of the
     difference attributes read by the getters and as_dict() are set.

     Args:
       gs: instance of GSUtils object we can use to download images
       storage_root: root directory on local disk within which we store all
           images
       expected_image_url: file, GS, or HTTP url from which we will download the
           expected image
       expected_image_locator: a unique ID string under which we will store the
           expected image within storage_root (probably including a checksum to
           guarantee uniqueness)
       actual_image_url: file, GS, or HTTP url from which we will download the
           actual image
       actual_image_locator: a unique ID string under which we will store the
           actual image within storage_root (probably including a checksum to
           guarantee uniqueness)
       expected_images_subdir: the subdirectory expected images are stored in.
       actual_images_subdir: the subdirectory actual images are stored in.
       image_suffix: the suffix of images.

     Raises:
       Exception: any error hit while downloading an image is logged and then
           re-raised; errors from running skpdiff or from reading its summary
           file propagate as well.
     """
     expected_image_locator = _sanitize_locator(expected_image_locator)
     actual_image_locator = _sanitize_locator(actual_image_locator)

     # Download the expected/actual images, if we don't have them already.
     expected_image_file = os.path.join(
         storage_root, expected_images_subdir,
         str(expected_image_locator) + image_suffix)
     actual_image_file = os.path.join(
         storage_root, actual_images_subdir,
         str(actual_image_locator) + image_suffix)
     for image_file, image_url in [
         (expected_image_file, expected_image_url),
         (actual_image_file, actual_image_url)]:
       if image_file and image_url:
         try:
           _download_file(gs, image_file, image_url)
         except Exception:
           logging.exception('unable to download image_url %s to file %s' %
                             (image_url, image_file))
           raise

     # Return early if we do not need to generate diffs.
     if (expected_image_url == actual_image_url or
         not expected_image_url or not actual_image_url):
       return

     # Get all diff images and values using the skpdiff binary.
     # The JSON summary goes into a throwaway directory that is removed in the
     # finally clause; the diff images go under storage_root and are kept.
     skpdiff_output_dir = tempfile.mkdtemp()
     try:
       skpdiff_summary_file = os.path.join(skpdiff_output_dir,
                                           'skpdiff-output.json')
       skpdiff_rgbdiff_dir = os.path.join(storage_root, RGBDIFFS_SUBDIR)
       skpdiff_whitediff_dir = os.path.join(storage_root, WHITEDIFFS_SUBDIR)
       # Both output directories must exist before skpdiff is pointed at them.
       _mkdir_unless_exists(skpdiff_rgbdiff_dir)
       _mkdir_unless_exists(skpdiff_whitediff_dir)

       # TODO(epoger): Consider calling skpdiff ONCE for all image pairs,
       # instead of calling it separately for each image pair.
       # Pro: we'll incur less overhead from making repeated system calls,
       # spinning up the skpdiff binary, etc.
       # Con: we would have to wait until all image pairs were loaded before
       # generating any of the diffs?
       # Note(stephana): '--longnames' was added to allow for this
       # case (multiple files at once) versus specifying output diffs
       # directly.
       find_run_binary.run_command(
           [SKPDIFF_BINARY, '-p', expected_image_file, actual_image_file,
            '--jsonp', 'false',
            '--longnames', 'true',
            '--output', skpdiff_summary_file,
            '--differs', 'perceptual', 'different_pixels',
            '--rgbDiffDir', skpdiff_rgbdiff_dir,
            '--whiteDiffDir', skpdiff_whitediff_dir,
            ])

       # Get information out of the skpdiff_summary_file.
       with open(skpdiff_summary_file) as fp:
         data = json.load(fp)

       # For now, we can assume there is only one record in the output summary,
       # since we passed skpdiff only one pair of images.
       record = data['records'][0]
       self._width = record['width']
       self._height = record['height']
       # Only the file names are kept, not the directories they live in.
       self._diffUrl = os.path.split(record['rgbDiffPath'])[1]
       self._whiteDiffUrl = os.path.split(record['whiteDiffPath'])[1]

       # TODO: make max_diff_per_channel a tuple instead of a list, because the
       # structure is meaningful (first element is red, second is green, etc.)
       # See http://stackoverflow.com/a/626871
       self._max_diff_per_channel = [
           record['maxRedDiff'], record['maxGreenDiff'], record['maxBlueDiff']]
       # NOTE: assumes the summary holds an entry for each differ requested
       # via --differs above; without a 'perceptual' entry,
       # perceptual_similarity is never bound and the code below fails.
       per_differ_stats = record['diffs']
       for stats in per_differ_stats:
         differ_name = stats['differName']
         if differ_name == 'different_pixels':
           self._num_pixels_differing = stats['pointsOfInterest']
         elif differ_name == 'perceptual':
           perceptual_similarity = stats['result']

       # skpdiff returns the perceptual similarity; convert it to get the
       # perceptual difference percentage.
       # skpdiff outputs -1 if the images are different sizes. Treat any
       # output that does not lie in [0, 1] as having 0% perceptual
       # similarity.
       if not 0 <= perceptual_similarity <= 1:
         perceptual_similarity = 0
       self._perceptual_difference = 100 - (perceptual_similarity * 100)
     finally:
       shutil.rmtree(skpdiff_output_dir)

   # TODO(epoger): Use properties instead of getters throughout.
   # See http://stackoverflow.com/a/6618176
   def get_num_pixels_differing(self):
     """Returns the absolute number of pixels that differ."""
     return self._num_pixels_differing

   def get_percent_pixels_differing(self):
     """Returns the percentage of pixels that differ, as a float between
     0 and 100 (inclusive)."""
     return ((float(self._num_pixels_differing) * 100) /
             (self._width * self._height))

   def get_perceptual_difference(self):
     """Returns the perceptual difference percentage."""
     return self._perceptual_difference

   def get_max_diff_per_channel(self):
     """Returns the maximum difference between the expected and actual images
     for each R/G/B channel, as a list."""
     return self._max_diff_per_channel

   def as_dict(self):
     """Returns a dictionary representation of this DiffRecord, as needed when
     constructing the JSON representation."""
     return {
         KEY__DIFFERENCES__NUM_DIFF_PIXELS: self._num_pixels_differing,
         KEY__DIFFERENCES__PERCENT_DIFF_PIXELS:
             self.get_percent_pixels_differing(),
         KEY__DIFFERENCES__MAX_DIFF_PER_CHANNEL: self._max_diff_per_channel,
         KEY__DIFFERENCES__PERCEPTUAL_DIFF: self._perceptual_difference,
         KEY__DIFFERENCES__DIFF_URL: self._diffUrl,
         KEY__DIFFERENCES__WHITE_DIFF_URL: self._whiteDiffUrl,
     }


 class ImageDiffDB(object):
   """Calculates differences between image pairs, maintaining a database of
   them for download.

   Requests queued by add_image_pair() are processed by daemon worker threads;
   get_diff_record() waits for the outcome of a queued request.
   """

   def __init__(self, storage_root, gs=None,
                num_worker_threads=DEFAULT_NUM_WORKER_THREADS):
     """Set up an empty database and start its worker threads.

     Args:
       storage_root: string; root path within which the DB will store all of
           its files
       gs: instance of GSUtils object we can use to download images
       num_worker_threads: how many threads download images and generate
           diffs simultaneously
     """
     self._storage_root = storage_root
     self._gs = gs

     # State used to report the queue size periodically.
     self._last_queue_size_reported = None
     self._queue_size_report_lock = threading.RLock()

     # Maps (expected_image_locator, actual_image_locator) tuples to
     # DiffRecords, or to _DIFFRECORD_PENDING / _DIFFRECORD_FAILED.
     #
     # Any thread that modifies _diff_dict must first acquire
     # _diff_dict_writelock!
     #
     # TODO(epoger): Disk is limitless, but RAM is not... so, we should probably
     # remove items from self._diff_dict if they haven't been accessed for a
     # long time.  We can always regenerate them by diffing the images we
     # previously downloaded to local disk.
     # I guess we should figure out how expensive it is to download vs diff the
     # image pairs... if diffing them is expensive too, we can write these
     # _diff_dict objects out to disk if there's too many to hold in RAM.
     # Or we could use virtual memory to handle that automatically.
     self._diff_dict = {}
     self._diff_dict_writelock = threading.RLock()

     # The task queue is unbounded (maxsize=0) so that asynchronous calls can
     # return as soon as possible; worker threads start reading from it right
     # away.
     self._tasks_queue = Queue.Queue(maxsize=0)
     self._workers = []
     for worker_num in range(num_worker_threads):
       thread = threading.Thread(target=self.worker, args=(worker_num,))
       thread.daemon = True
       thread.start()
       self._workers.append(thread)

   def log_queue_size_if_changed(self, limit_verbosity=True):
     """Log the size of self._tasks_queue, if it has changed since the last call.

     Nothing is logged when the size equals the most recently logged size.

     Args:
       limit_verbosity: if True, only log if the queue size is a multiple of
           QUEUE_LOGGING_GRANULARITY
     """
     # The lock synchronizes access to self._last_queue_size_reported.
     with self._queue_size_report_lock:
       current_size = self._tasks_queue.qsize()
       if current_size != self._last_queue_size_reported:
         if (not limit_verbosity or
             current_size % QUEUE_LOGGING_GRANULARITY == 0):
           logging.info('tasks_queue size is %d' % current_size)
           self._last_queue_size_reported = current_size

   def worker(self, worker_num):
     """Body of a worker thread: pull tasks off self._tasks_queue forever.

     Each task yields either a DiffRecord or, if building one raised,
     _DIFFRECORD_FAILED; the outcome is stored in self._diff_dict.

     Args:
       worker_num: (integer) which worker this is
     """
     while True:
       self.log_queue_size_if_changed()
       key, expected_image_url, actual_image_url = self._tasks_queue.get()
       try:
         outcome = DiffRecord(
             self._gs, self._storage_root,
             expected_image_url, key[0],
             actual_image_url, key[1])
       except Exception:
         logging.exception(
             'exception while creating DiffRecord for key %s' % str(key))
         outcome = _DIFFRECORD_FAILED
       with self._diff_dict_writelock:
         self._diff_dict[key] = outcome

   @property
   def storage_root(self):
     """Root path under which this DB stores its files."""
     return self._storage_root

   def add_image_pair(self,
                      expected_image_url, expected_image_locator,
                      actual_image_url, actual_image_locator):
     """Asynchronously prepare a DiffRecord for a pair of images.

     This method will return quickly; calls to get_diff_record() will block
     until the DiffRecord is available (or we have given up on creating it).

     If we already have a DiffRecord for this particular image pair, no work
     will be done.

     If expected_image_url (or its locator) is None, just download actual_image.
     If actual_image_url (or its locator) is None, just download expected_image.

     Args:
       expected_image_url: file, GS, or HTTP url from which we will download the
           expected image
       expected_image_locator: a unique ID string under which we will store the
           expected image within storage_root (probably including a checksum to
           guarantee uniqueness)
       actual_image_url: file, GS, or HTTP url from which we will download the
           actual image
       actual_image_locator: a unique ID string under which we will store the
           actual image within storage_root (probably including a checksum to
           guarantee uniqueness)
     """
     key = (_sanitize_locator(expected_image_locator),
            _sanitize_locator(actual_image_locator))

     # Claim the key under the lock, so that each pair is queued only once.
     with self._diff_dict_writelock:
       is_new_pair = key not in self._diff_dict
       if is_new_pair:
         self._diff_dict[key] = _DIFFRECORD_PENDING

     # A diff for this pair was already requested; nothing more to do.
     if not is_new_pair:
       return

     self._tasks_queue.put((key, expected_image_url, actual_image_url))
     self.log_queue_size_if_changed()

   def get_diff_record(self, expected_image_locator, actual_image_locator):
     """Returns the DiffRecord for this image pair.

     This call will block until the diff record is available, or we were unable
     to generate it.  The pair is looked up directly in the internal
     dictionary, so a pair that is not in it raises KeyError.

     Args:
       expected_image_locator: a unique ID string under which we will store the
           expected image within storage_root (probably including a checksum to
           guarantee uniqueness)
       actual_image_locator: a unique ID string under which we will store the
           actual image within storage_root (probably including a checksum to
           guarantee uniqueness)

     Returns the DiffRecord for this image pair, or None if we were unable to
     generate one.
     """
     key = (_sanitize_locator(expected_image_locator),
            _sanitize_locator(actual_image_locator))

     # Poll once per second until a worker has replaced the PENDING marker.
     while True:
       diff_record = self._diff_dict[key]
       if not diff_record == _DIFFRECORD_PENDING:
         break
       time.sleep(1)

     if diff_record == _DIFFRECORD_FAILED:
       logging.error(
           'failed to create a DiffRecord for expected_image_locator=%s , '
           'actual_image_locator=%s' % (
               expected_image_locator, actual_image_locator))
       return None
     return diff_record


 # Utility functions

 def _download_file(gs, local_filepath, url):
   """Download a file from url to local_filepath, unless it is already there.

   The contents are first written to a per-thread temporary file next to
   local_filepath and then renamed into place.  The temporary file is removed
   whenever it is not renamed (another thread finished the same download
   first, or the download failed), so no stray files are left behind.

   Args:
     gs: instance of GSUtils object, in case the url points at Google Storage
     local_filepath: path on local disk where the image should be stored
     url: HTTP or GS URL from which we can download the image if we don't have
         it yet
   """
   global global_file_collisions
   if os.path.exists(local_filepath):
     return
   _mkdir_unless_exists(os.path.dirname(local_filepath))

   # First download the file contents into a unique filename, and
   # then rename that file.  That way, if multiple threads are downloading
   # the same filename at the same time, they won't interfere with each
   # other (they will both download the file, and one will "win" in the end)
   temp_filename = '%s-%d' % (local_filepath,
                              threading.current_thread().ident)
   try:
     if gs_utils.GSUtils.is_gs_url(url):
       (bucket, path) = gs_utils.GSUtils.split_gs_url(url)
       gs.download_file(source_bucket=bucket, source_path=path,
                        dest_path=temp_filename)
     else:
       with contextlib.closing(urllib.urlopen(url)) as url_handle:
         with open(temp_filename, 'wb') as file_handle:
           shutil.copyfileobj(fsrc=url_handle, fdst=file_handle)

     # Rename the file to its real filename.
     # Keep count of how many colliding downloads we encounter;
     # if it's a large number, we may want to change our download strategy
     # to minimize repeated downloads.
     if os.path.exists(local_filepath):
       global_file_collisions += 1
     else:
       os.rename(temp_filename, local_filepath)
   finally:
     # After a successful rename the temporary name no longer exists; in every
     # other case (collision, failed download) delete the leftover file.
     if os.path.exists(temp_filename):
       os.remove(temp_filename)


 def _mkdir_unless_exists(path):
   """Unless path refers to an already-existing directory, create it.

   Missing parent directories are created as well.

   Args:
     path: path on local disk

   Raises:
     OSError: if the directory could not be created and path does not refer
         to an existing directory.
   """
   try:
     os.makedirs(path)
   except OSError as e:
     # An already-existing directory is the expected, harmless case (for
     # example when another thread created it first).  Anything else, such as
     # a permission problem or a regular file in the way, is a real failure
     # and must reach the caller.
     if e.errno != errno.EEXIST or not os.path.isdir(path):
       raise


 def _sanitize_locator(locator):
   """Returns a filename-safe version of a locator.

   Every character matched by DISALLOWED_FILEPATH_CHAR_REGEX is replaced with
   an underscore, so that none of the remaining characters has a special
   meaning in filenames.

   Args:
     locator: string, or something that can be represented as a string.
         If None or '', it is returned without modification, because empty
         locators have a particular meaning ("there is no image for this")
   """
   # Empty locators are passed through untouched.
   if not locator:
     return locator
   return DISALLOWED_FILEPATH_CHAR_REGEX.sub('_', str(locator))
	#!/usr/bin/python

	"""
	Copyright 2013 Google Inc.

	Use of this source code is governed by a BSD-style license that can be
	found in the LICENSE file.

	Calculate differences between image pairs, and store them in a database.
	"""

	# System-level imports
	import contextlib
	import errno
	import json
	import logging
	import os
	import Queue
	import re
	import shutil
	import tempfile
	import threading
	import time
	import urllib

	# Must fix up PYTHONPATH before importing from within Skia
	import rs_fixpypath # pylint: disable=W0611

	# Imports from within Skia
	import find_run_binary
	from py.utils import gs_utils


	# Location of the 'skpdiff' program, as returned by
	# find_run_binary.find_path_to_program(); looked up once, when this module is
	# imported.
	SKPDIFF_BINARY = find_run_binary.find_path_to_program('skpdiff')

	DEFAULT_IMAGE_SUFFIX = '.png'
	DEFAULT_IMAGES_SUBDIR = 'images'
	# TODO(epoger): Figure out a better default number of threads; for now,
	# using a conservative default value.
	DEFAULT_NUM_WORKER_THREADS = 1

	# Matches every character that is neither a word character (\w) nor '-'.
	# Written as a raw string so the backslashes reach the regex engine exactly
	# as typed, instead of relying on Python leaving unrecognized string escapes
	# untouched.
	DISALLOWED_FILEPATH_CHAR_REGEX = re.compile(r'[^\w\-]')

	# Subdirectories of storage_root that receive the diff images written by
	# skpdiff.
	RGBDIFFS_SUBDIR = 'diffs'
	WHITEDIFFS_SUBDIR = 'whitediffs'

	# Keys used within DiffRecord dictionary representations.
	# NOTE: Keep these in sync with static/constants.js
	KEY__DIFFERENCES__MAX_DIFF_PER_CHANNEL = 'maxDiffPerChannel'
	KEY__DIFFERENCES__NUM_DIFF_PIXELS = 'numDifferingPixels'
	KEY__DIFFERENCES__PERCENT_DIFF_PIXELS = 'percentDifferingPixels'
	KEY__DIFFERENCES__PERCEPTUAL_DIFF = 'perceptualDifference'
	KEY__DIFFERENCES__DIFF_URL = 'diffUrl'
	KEY__DIFFERENCES__WHITE_DIFF_URL = 'whiteDiffUrl'

	# Special values within ImageDiffDB._diff_dict
	_DIFFRECORD_FAILED = 'failed'
	_DIFFRECORD_PENDING = 'pending'

	# How often to report tasks_queue size
	QUEUE_LOGGING_GRANULARITY = 1000

	# Temporary variable to keep track of how many times we download
	# the same file in multiple threads.
	# TODO(epoger): Delete this, once we see that the number stays close to 0.
	global_file_collisions = 0


	class DiffRecord(object):
	  """Record of differences between two images, as computed by skpdiff."""

	  def __init__(self, gs, storage_root,
	               expected_image_url, expected_image_locator,
	               actual_image_url, actual_image_locator,
	               expected_images_subdir=DEFAULT_IMAGES_SUBDIR,
	               actual_images_subdir=DEFAULT_IMAGES_SUBDIR,
	               image_suffix=DEFAULT_IMAGE_SUFFIX):
	    """Download this pair of images (unless we already have them on local disk),
	    and prepare a DiffRecord for them.

	    If either URL is empty, or both URLs are the same, no diff is generated:
	    the constructor returns right after the download step, and none of the
	    difference attributes read by the getters and as_dict() are set.

	    Args:
	      gs: instance of GSUtils object we can use to download images
	      storage_root: root directory on local disk within which we store all
	          images
	      expected_image_url: file, GS, or HTTP url from which we will download the
	          expected image
	      expected_image_locator: a unique ID string under which we will store the
	          expected image within storage_root (probably including a checksum to
	          guarantee uniqueness)
	      actual_image_url: file, GS, or HTTP url from which we will download the
	          actual image
	      actual_image_locator: a unique ID string under which we will store the
	          actual image within storage_root (probably including a checksum to
	          guarantee uniqueness)
	      expected_images_subdir: the subdirectory expected images are stored in.
	      actual_images_subdir: the subdirectory actual images are stored in.
	      image_suffix: the suffix of images.

	    Raises:
	      Exception: any error hit while downloading an image is logged and then
	          re-raised; errors from running skpdiff or from reading its summary
	          file propagate as well.
	    """
	    expected_image_locator = _sanitize_locator(expected_image_locator)
	    actual_image_locator = _sanitize_locator(actual_image_locator)

	    # Download the expected/actual images, if we don't have them already.
	    expected_image_file = os.path.join(
	        storage_root, expected_images_subdir,
	        str(expected_image_locator) + image_suffix)
	    actual_image_file = os.path.join(
	        storage_root, actual_images_subdir,
	        str(actual_image_locator) + image_suffix)
	    for image_file, image_url in [
	        (expected_image_file, expected_image_url),
	        (actual_image_file, actual_image_url)]:
	      if image_file and image_url:
	        try:
	          _download_file(gs, image_file, image_url)
	        except Exception:
	          logging.exception('unable to download image_url %s to file %s' %
	                            (image_url, image_file))
	          raise

	    # Return early if we do not need to generate diffs.
	    if (expected_image_url == actual_image_url or
	        not expected_image_url or not actual_image_url):
	      return

	    # Get all diff images and values using the skpdiff binary.
	    # The JSON summary goes into a throwaway directory that is removed in the
	    # finally clause; the diff images go under storage_root and are kept.
	    skpdiff_output_dir = tempfile.mkdtemp()
	    try:
	      skpdiff_summary_file = os.path.join(skpdiff_output_dir,
	                                          'skpdiff-output.json')
	      skpdiff_rgbdiff_dir = os.path.join(storage_root, RGBDIFFS_SUBDIR)
	      skpdiff_whitediff_dir = os.path.join(storage_root, WHITEDIFFS_SUBDIR)
	      # Both output directories must exist before skpdiff is pointed at them.
	      _mkdir_unless_exists(skpdiff_rgbdiff_dir)
	      _mkdir_unless_exists(skpdiff_whitediff_dir)

	      # TODO(epoger): Consider calling skpdiff ONCE for all image pairs,
	      # instead of calling it separately for each image pair.
	      # Pro: we'll incur less overhead from making repeated system calls,
	      # spinning up the skpdiff binary, etc.
	      # Con: we would have to wait until all image pairs were loaded before
	      # generating any of the diffs?
	      # Note(stephana): '--longnames' was added to allow for this
	      # case (multiple files at once) versus specifying output diffs
	      # directly.
	      find_run_binary.run_command(
	          [SKPDIFF_BINARY, '-p', expected_image_file, actual_image_file,
	           '--jsonp', 'false',
	           '--longnames', 'true',
	           '--output', skpdiff_summary_file,
	           '--differs', 'perceptual', 'different_pixels',
	           '--rgbDiffDir', skpdiff_rgbdiff_dir,
	           '--whiteDiffDir', skpdiff_whitediff_dir,
	           ])

	      # Get information out of the skpdiff_summary_file.
	      with open(skpdiff_summary_file) as fp:
	        data = json.load(fp)

	      # For now, we can assume there is only one record in the output summary,
	      # since we passed skpdiff only one pair of images.
	      record = data['records'][0]
	      self._width = record['width']
	      self._height = record['height']
	      # Only the file names are kept, not the directories they live in.
	      self._diffUrl = os.path.split(record['rgbDiffPath'])[1]
	      self._whiteDiffUrl = os.path.split(record['whiteDiffPath'])[1]

	      # TODO: make max_diff_per_channel a tuple instead of a list, because the
	      # structure is meaningful (first element is red, second is green, etc.)
	      # See http://stackoverflow.com/a/626871
	      self._max_diff_per_channel = [
	          record['maxRedDiff'], record['maxGreenDiff'], record['maxBlueDiff']]
	      # NOTE: assumes the summary holds an entry for each differ requested
	      # via --differs above; without a 'perceptual' entry,
	      # perceptual_similarity is never bound and the code below fails.
	      per_differ_stats = record['diffs']
	      for stats in per_differ_stats:
	        differ_name = stats['differName']
	        if differ_name == 'different_pixels':
	          self._num_pixels_differing = stats['pointsOfInterest']
	        elif differ_name == 'perceptual':
	          perceptual_similarity = stats['result']

	      # skpdiff returns the perceptual similarity; convert it to get the
	      # perceptual difference percentage.
	      # skpdiff outputs -1 if the images are different sizes. Treat any
	      # output that does not lie in [0, 1] as having 0% perceptual
	      # similarity.
	      if not 0 <= perceptual_similarity <= 1:
	        perceptual_similarity = 0
	      self._perceptual_difference = 100 - (perceptual_similarity * 100)
	    finally:
	      shutil.rmtree(skpdiff_output_dir)

	  # TODO(epoger): Use properties instead of getters throughout.
	  # See http://stackoverflow.com/a/6618176
	  def get_num_pixels_differing(self):
	    """Returns the absolute number of pixels that differ."""
	    return self._num_pixels_differing

	  def get_percent_pixels_differing(self):
	    """Returns the percentage of pixels that differ, as a float between
	    0 and 100 (inclusive)."""
	    return ((float(self._num_pixels_differing) * 100) /
	            (self._width * self._height))

	  def get_perceptual_difference(self):
	    """Returns the perceptual difference percentage."""
	    return self._perceptual_difference

	  def get_max_diff_per_channel(self):
	    """Returns the maximum difference between the expected and actual images
	    for each R/G/B channel, as a list."""
	    return self._max_diff_per_channel

	  def as_dict(self):
	    """Returns a dictionary representation of this DiffRecord, as needed when
	    constructing the JSON representation."""
	    return {
	        KEY__DIFFERENCES__NUM_DIFF_PIXELS: self._num_pixels_differing,
	        KEY__DIFFERENCES__PERCENT_DIFF_PIXELS:
	            self.get_percent_pixels_differing(),
	        KEY__DIFFERENCES__MAX_DIFF_PER_CHANNEL: self._max_diff_per_channel,
	        KEY__DIFFERENCES__PERCEPTUAL_DIFF: self._perceptual_difference,
	        KEY__DIFFERENCES__DIFF_URL: self._diffUrl,
	        KEY__DIFFERENCES__WHITE_DIFF_URL: self._whiteDiffUrl,
	    }



	class ImageDiffDB(object):
	  """Calculates differences between image pairs, maintaining a database of
	  them for download.

	  Requests queued by add_image_pair() are processed by daemon worker threads;
	  get_diff_record() waits for the outcome of a queued request.
	  """

	  def __init__(self, storage_root, gs=None,
	               num_worker_threads=DEFAULT_NUM_WORKER_THREADS):
	    """Set up an empty database and start its worker threads.

	    Args:
	      storage_root: string; root path within which the DB will store all of
	          its files
	      gs: instance of GSUtils object we can use to download images
	      num_worker_threads: how many threads download images and generate
	          diffs simultaneously
	    """
	    self._storage_root = storage_root
	    self._gs = gs

	    # State used to report the queue size periodically.
	    self._last_queue_size_reported = None
	    self._queue_size_report_lock = threading.RLock()

	    # Maps (expected_image_locator, actual_image_locator) tuples to
	    # DiffRecords, or to _DIFFRECORD_PENDING / _DIFFRECORD_FAILED.
	    #
	    # Any thread that modifies _diff_dict must first acquire
	    # _diff_dict_writelock!
	    #
	    # TODO(epoger): Disk is limitless, but RAM is not... so, we should probably
	    # remove items from self._diff_dict if they haven't been accessed for a
	    # long time. We can always regenerate them by diffing the images we
	    # previously downloaded to local disk.
	    # I guess we should figure out how expensive it is to download vs diff the
	    # image pairs... if diffing them is expensive too, we can write these
	    # _diff_dict objects out to disk if there's too many to hold in RAM.
	    # Or we could use virtual memory to handle that automatically.
	    self._diff_dict = {}
	    self._diff_dict_writelock = threading.RLock()

	    # The task queue is unbounded (maxsize=0) so that asynchronous calls can
	    # return as soon as possible; worker threads start reading from it right
	    # away.
	    self._tasks_queue = Queue.Queue(maxsize=0)
	    self._workers = []
	    for worker_num in range(num_worker_threads):
	      thread = threading.Thread(target=self.worker, args=(worker_num,))
	      thread.daemon = True
	      thread.start()
	      self._workers.append(thread)

	  def log_queue_size_if_changed(self, limit_verbosity=True):
	    """Log the size of self._tasks_queue, if it has changed since the last call.

	    Nothing is logged when the size equals the most recently logged size.

	    Args:
	      limit_verbosity: if True, only log if the queue size is a multiple of
	          QUEUE_LOGGING_GRANULARITY
	    """
	    # The lock synchronizes access to self._last_queue_size_reported.
	    with self._queue_size_report_lock:
	      current_size = self._tasks_queue.qsize()
	      if current_size != self._last_queue_size_reported:
	        if (not limit_verbosity or
	            current_size % QUEUE_LOGGING_GRANULARITY == 0):
	          logging.info('tasks_queue size is %d' % current_size)
	          self._last_queue_size_reported = current_size

	  def worker(self, worker_num):
	    """Body of a worker thread: pull tasks off self._tasks_queue forever.

	    Each task yields either a DiffRecord or, if building one raised,
	    _DIFFRECORD_FAILED; the outcome is stored in self._diff_dict.

	    Args:
	      worker_num: (integer) which worker this is
	    """
	    while True:
	      self.log_queue_size_if_changed()
	      key, expected_image_url, actual_image_url = self._tasks_queue.get()
	      try:
	        outcome = DiffRecord(
	            self._gs, self._storage_root,
	            expected_image_url, key[0],
	            actual_image_url, key[1])
	      except Exception:
	        logging.exception(
	            'exception while creating DiffRecord for key %s' % str(key))
	        outcome = _DIFFRECORD_FAILED
	      with self._diff_dict_writelock:
	        self._diff_dict[key] = outcome

	  @property
	  def storage_root(self):
	    """Root path under which this DB stores its files."""
	    return self._storage_root

	  def add_image_pair(self,
	                     expected_image_url, expected_image_locator,
	                     actual_image_url, actual_image_locator):
	    """Asynchronously prepare a DiffRecord for a pair of images.

	    This method will return quickly; calls to get_diff_record() will block
	    until the DiffRecord is available (or we have given up on creating it).

	    If we already have a DiffRecord for this particular image pair, no work
	    will be done.

	    If expected_image_url (or its locator) is None, just download actual_image.
	    If actual_image_url (or its locator) is None, just download expected_image.

	    Args:
	      expected_image_url: file, GS, or HTTP url from which we will download the
	          expected image
	      expected_image_locator: a unique ID string under which we will store the
	          expected image within storage_root (probably including a checksum to
	          guarantee uniqueness)
	      actual_image_url: file, GS, or HTTP url from which we will download the
	          actual image
	      actual_image_locator: a unique ID string under which we will store the
	          actual image within storage_root (probably including a checksum to
	          guarantee uniqueness)
	    """
	    key = (_sanitize_locator(expected_image_locator),
	           _sanitize_locator(actual_image_locator))

	    # Claim the key under the lock, so that each pair is queued only once.
	    with self._diff_dict_writelock:
	      is_new_pair = key not in self._diff_dict
	      if is_new_pair:
	        self._diff_dict[key] = _DIFFRECORD_PENDING

	    # A diff for this pair was already requested; nothing more to do.
	    if not is_new_pair:
	      return

	    self._tasks_queue.put((key, expected_image_url, actual_image_url))
	    self.log_queue_size_if_changed()

	  def get_diff_record(self, expected_image_locator, actual_image_locator):
	    """Returns the DiffRecord for this image pair.

	    This call will block until the diff record is available, or we were unable
	    to generate it. The pair is looked up directly in the internal
	    dictionary, so a pair that is not in it raises KeyError.

	    Args:
	      expected_image_locator: a unique ID string under which we will store the
	          expected image within storage_root (probably including a checksum to
	          guarantee uniqueness)
	      actual_image_locator: a unique ID string under which we will store the
	          actual image within storage_root (probably including a checksum to
	          guarantee uniqueness)

	    Returns the DiffRecord for this image pair, or None if we were unable to
	    generate one.
	    """
	    key = (_sanitize_locator(expected_image_locator),
	           _sanitize_locator(actual_image_locator))

	    # Poll once per second until a worker has replaced the PENDING marker.
	    while True:
	      diff_record = self._diff_dict[key]
	      if not diff_record == _DIFFRECORD_PENDING:
	        break
	      time.sleep(1)

	    if diff_record == _DIFFRECORD_FAILED:
	      logging.error(
	          'failed to create a DiffRecord for expected_image_locator=%s , '
	          'actual_image_locator=%s' % (
	              expected_image_locator, actual_image_locator))
	      return None
	    return diff_record


	# Utility functions

	def _download_file(gs, local_filepath, url):
	  """Download a file from url to local_filepath, unless it is already there.

	  The contents are first written to a per-thread temporary file next to
	  local_filepath and then renamed into place. The temporary file is removed
	  whenever it is not renamed (another thread finished the same download
	  first, or the download failed), so no stray files are left behind.

	  Args:
	    gs: instance of GSUtils object, in case the url points at Google Storage
	    local_filepath: path on local disk where the image should be stored
	    url: HTTP or GS URL from which we can download the image if we don't have
	        it yet
	  """
	  global global_file_collisions
	  if os.path.exists(local_filepath):
	    return
	  _mkdir_unless_exists(os.path.dirname(local_filepath))

	  # First download the file contents into a unique filename, and
	  # then rename that file. That way, if multiple threads are downloading
	  # the same filename at the same time, they won't interfere with each
	  # other (they will both download the file, and one will "win" in the end)
	  temp_filename = '%s-%d' % (local_filepath,
	                             threading.current_thread().ident)
	  try:
	    if gs_utils.GSUtils.is_gs_url(url):
	      (bucket, path) = gs_utils.GSUtils.split_gs_url(url)
	      gs.download_file(source_bucket=bucket, source_path=path,
	                       dest_path=temp_filename)
	    else:
	      with contextlib.closing(urllib.urlopen(url)) as url_handle:
	        with open(temp_filename, 'wb') as file_handle:
	          shutil.copyfileobj(fsrc=url_handle, fdst=file_handle)

	    # Rename the file to its real filename.
	    # Keep count of how many colliding downloads we encounter;
	    # if it's a large number, we may want to change our download strategy
	    # to minimize repeated downloads.
	    if os.path.exists(local_filepath):
	      global_file_collisions += 1
	    else:
	      os.rename(temp_filename, local_filepath)
	  finally:
	    # After a successful rename the temporary name no longer exists; in every
	    # other case (collision, failed download) delete the leftover file.
	    if os.path.exists(temp_filename):
	      os.remove(temp_filename)


	def _mkdir_unless_exists(path):
	  """Unless path refers to an already-existing directory, create it.

	  Missing parent directories are created as well.

	  Args:
	    path: path on local disk

	  Raises:
	    OSError: if the directory could not be created and path does not refer
	        to an existing directory.
	  """
	  try:
	    os.makedirs(path)
	  except OSError as e:
	    # An already-existing directory is the expected, harmless case (for
	    # example when another thread created it first). Anything else, such as
	    # a permission problem or a regular file in the way, is a real failure
	    # and must reach the caller.
	    if e.errno != errno.EEXIST or not os.path.isdir(path):
	      raise


	def _sanitize_locator(locator):
	  """Returns a filename-safe version of a locator.

	  Every character matched by DISALLOWED_FILEPATH_CHAR_REGEX is replaced with
	  an underscore, so that none of the remaining characters has a special
	  meaning in filenames.

	  Args:
	    locator: string, or something that can be represented as a string.
	        If None or '', it is returned without modification, because empty
	        locators have a particular meaning ("there is no image for this")
	  """
	  # Empty locators are passed through untouched.
	  if not locator:
	    return locator
	  return DISALLOWED_FILEPATH_CHAR_REGEX.sub('_', str(locator))