blob: fb98bb73e9a45b3381dc7fd86ae08f26b983cb6b [file] [log] [blame]
#!/usr/bin/env python
import os
import os.path
import sys
import tempfile
import xml.parsers.expat
"""
Scans each XML resource file in the res/values*/ directories looking for
duplicate resource definitions.
All but the last occurrence of a resource definition are removed.
This creates no semantic changes: the resulting APK, when built,
should contain the same definitions.
"""
class Duplicate(object):
    """A small struct to maintain the positions of a Duplicate resource definition.

    Attributes:
      name:    identifier of the resource, stored exactly as passed in.
      product: product qualifier of the definition, stored exactly as passed in.
      depth:   element nesting depth at which the definition was opened.
      start:   position where the definition begins.
      end:     position where the definition ends; may be None until the
               closing tag has been seen.
    """

    def __init__(self, name, product, depth, start, end):
        self.name = name
        self.product = product
        self.depth = depth
        self.start = start
        self.end = end

    def __repr__(self):
        # Purely a debugging aid; nothing in the tool parses this text.
        return ("Duplicate(name={0!r}, product={1!r}, depth={2!r}, "
                "start={3!r}, end={4!r})").format(
                    self.name, self.product, self.depth, self.start, self.end)
class ResourceDefinitionLocator(object):
    """Callback class for xml.parsers.expat which records resource definitions and their
    locations.

    Install `start_element` / `end_element` as the parser's StartElementHandler /
    EndElementHandler. After parsing, `resource_definitions` maps
    "<type>/<name>:<product>" to the list of definitions seen under that key,
    in document order.
    """

    # Elements directly under the root that are never recorded as definitions.
    _IGNORED_TAGS = frozenset(["public", "java-symbol", "eat-comment", "skip"])

    def __init__(self, parser):
        """Keeps a reference to `parser` so handlers can query its current position."""
        self.resource_definitions = {}
        self._parser = parser
        self._depth = 0
        self._current_resource = None

    def start_element(self, tag_name, attrs):
        """Starts tracking a definition when a depth-2 element is opened.

        Elements in the ignore list, and elements that lack the attributes
        needed to build a resource name, are not tracked.
        """
        self._depth += 1
        if self._depth != 2 or tag_name in self._IGNORED_TAGS:
            return

        # <item> carries its resource type in an attribute; every other
        # element uses its tag name as the type.
        if tag_name == "item":
            resource_type = attrs.get("type")
        else:
            resource_type = tag_name
        resource_name = attrs.get("name")
        if resource_type is None or resource_name is None:
            # Without a full name the element cannot be matched against any
            # other definition, so skip it instead of aborting the parse with
            # a KeyError raised from inside the expat callback.
            return

        self._current_resource = Duplicate(
            "{0}/{1}".format(resource_type, resource_name),
            attrs.get("product", ""),
            self._depth,
            # expat line numbers are 1-based; store them 0-based.
            (self._parser.CurrentLineNumber - 1, self._parser.CurrentColumnNumber),
            None)

    def end_element(self, tag_name):
        """Completes the tracked definition once its own closing tag is reached."""
        resource = self._current_resource
        if resource is not None and self._depth == resource.depth:
            # Record the end position of the element, which is the length of the name
            # plus the </> symbols (len("</>") == 3).
            # NOTE(review): this arithmetic assumes the parser is positioned at
            # the start of a "</tag>" token. For a self-closing element
            # ("<item ... />") the reported position may differ, so the computed
            # end may not land exactly on the end of the element -- confirm
            # against expat's behaviour.
            resource.end = (self._parser.CurrentLineNumber - 1,
                            self._parser.CurrentColumnNumber + 3 + len(tag_name))
            key_name = "{0}:{1}".format(resource.name, resource.product)
            self.resource_definitions.setdefault(key_name, []).append(resource)
            self._current_resource = None
        self._depth -= 1
def _collect_superseded(data):
    """Parses the raw file contents and returns every definition that a later
    definition with the same name and product overrides, sorted by start position.
    """
    parser = xml.parsers.expat.ParserCreate("utf-8")
    # `returns_unicode` only exists on Python 2 parsers; guard the assignment
    # so the same code also runs where the attribute is absent.
    if hasattr(parser, "returns_unicode"):
        parser.returns_unicode = True
    tracker = ResourceDefinitionLocator(parser)
    parser.StartElementHandler = tracker.start_element
    parser.EndElementHandler = tracker.end_element
    parser.Parse(data)

    # Ignore the last definition of each resource: it takes precedence and is
    # left intact. A resource defined only once contributes nothing.
    superseded = []
    for entries in tracker.resource_definitions.values():
        superseded.extend(entries[:-1])
    # Sort so that the removal below needs only one pass over the file.
    superseded.sort(key=lambda definition: definition.start)
    return superseded


def _strip_definitions(input_lines, duplicates):
    """Returns the lines of the file with the span of every entry in
    `duplicates` cut out. `duplicates` must be sorted by start position.
    Lines left with nothing but whitespace after a cut are dropped.
    """
    last_line_no = 0
    last_col_no = 0
    output_lines = []
    current_line = ""
    for definition in duplicates:
        start_line, start_col = definition.start
        if last_line_no < start_line:
            # The next definition is on a new line, so write what we have
            # to the output.
            new_line = current_line + input_lines[last_line_no][last_col_no:]
            if not new_line.isspace():
                output_lines.append(new_line)
            current_line = ""
            last_col_no = 0
            last_line_no += 1
        # Copy all the untouched lines up until the one holding this definition.
        output_lines.extend(input_lines[last_line_no:start_line])
        # Keep the prefix of this line, then skip ahead to the end of the
        # duplicate definition.
        current_line += input_lines[start_line][last_col_no:start_col]
        last_line_no, last_col_no = definition.end

    # Flush whatever follows the last removed definition on its final line,
    # then copy the rest of the file unchanged.
    new_line = current_line + input_lines[last_line_no][last_col_no:]
    if not new_line.isspace():
        output_lines.append(new_line)
    output_lines.extend(input_lines[last_line_no + 1:])
    return output_lines


def remove_duplicates(xml_path):
    """Reads the input file and generates an output file with any duplicate
    resources removed, keeping the last occurring definition and removing
    the others. The output is written to a temporary and then renamed
    to the original file name.

    Files without duplicates are left untouched. One line is printed per
    removed definition, plus one line announcing the rewrite.
    """
    # Read raw bytes: expat accepts them directly and the explicit decode
    # below then behaves the same on Python 2 and Python 3.
    with open(xml_path, "rb") as fin:
        data = fin.read()

    duplicates = _collect_superseded(data)
    if not duplicates:
        # Nothing to remove, so there is no reason to build a copy.
        return

    # Treat the input as UTF-8 or else column numbers will be wrong.
    # NOTE(review): splitlines() on text also breaks on separators such as
    # U+2028 and U+0085; if expat does not count those as line breaks, the
    # recorded line numbers would drift for files containing them -- confirm.
    input_lines = data.decode("utf-8").splitlines(True)

    for definition in duplicates:
        print("{0}:{1}:{2}: removing duplicate resource '{3}'".format(
            xml_path, definition.start[0] + 1, definition.start[1], definition.name))
    output_lines = _strip_definitions(input_lines, duplicates)

    print("{0}: writing deduped copy...".format(xml_path))
    # Write the lines to a temporary file next to the original.
    dirname, basename = os.path.split(xml_path)
    with tempfile.NamedTemporaryFile(prefix=basename, dir=dirname, delete=False) as temp:
        temp_name = temp.name
        temp.write("".join(output_lines).encode("utf-8"))
    # The temporary file is created readable/writable by its owner only; give
    # it the permission bits of the file it is about to replace.
    os.chmod(temp_name, os.stat(xml_path).st_mode & 0o7777)
    # Now rename that file to the original so we have an atomic write that is consistent.
    os.rename(temp_name, xml_path)
def enumerate_files(res_path):
    """Enumerates all XML files that sit directly inside a subdirectory of
    `res_path` whose name starts with "values" (e.g. values, values-fr).
    These types of files end up compiled in the resources.arsc table of an APK.

    Returns a list of paths, each built by joining `res_path`, the
    subdirectory name and the file name. Entries are visited in sorted order
    so the result is deterministic. Non-directory entries whose name happens
    to start with "values" are ignored.
    """
    all_files = []
    for entry in sorted(os.listdir(res_path)):
        values_dir = os.path.join(res_path, entry)
        # A plain file such as "values.txt" would make os.listdir() raise.
        if not entry.startswith('values') or not os.path.isdir(values_dir):
            continue
        all_files.extend(
            os.path.join(values_dir, file_name)
            for file_name in sorted(os.listdir(values_dir))
            if file_name.endswith('.xml'))
    return all_files
# Script entry point: dedupe every values*/*.xml file under the resource
# directory given as the first command-line argument.
if __name__ == '__main__':
    if len(sys.argv) < 2:
        # Written with sys.stderr.write() rather than the Python 2-only
        # "print >>" form; the emitted text is the same.
        sys.stderr.write("please specify a path to a resource directory\n")
        sys.exit(1)
    res_path = os.path.abspath(sys.argv[1])
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print("looking in {0} ...".format(res_path))
    for f in enumerate_files(res_path):
        print("checking {0} ...".format(f))
        remove_duplicates(f)