|  | #!/usr/bin/env python | 
|  | """ | 
|  | Unicode case folding database conversion utility | 
|  |  | 
|  | Parses the database and generates a C++ function which implements the case | 
|  | folding algorithm. The database entries are of the form: | 
|  |  | 
|  | <code>; <status>; <mapping>; # <name> | 
|  |  | 
|  | <status> can be one of four characters: | 
|  | C - Common mappings | 
|  | S - mappings for Simple case folding | 
|  | F - mappings for Full case folding | 
|  | T - special case for Turkish I characters | 
|  |  | 
|  | Right now this generates a function which implements simple case folding (C+S | 
|  | entries). | 
|  | """ | 
|  |  | 
|  | import sys | 
|  | import re | 
|  | import urllib2 | 
|  |  | 
|  | # This variable will body of the mappings function | 
|  | body = "" | 
|  |  | 
|  | # Reads file line-by-line, extracts Common and Simple case fold mappings and | 
|  | # returns a (from_char, to_char, from_name) tuple. | 
|  | def mappings(f): | 
|  | previous_from = -1 | 
|  | expr = re.compile(r'^(.*); [CS]; (.*); # (.*)') | 
|  | for line in f: | 
|  | m = expr.match(line) | 
|  | if not m: continue | 
|  | from_char = int(m.group(1), 16) | 
|  | to_char = int(m.group(2), 16) | 
|  | from_name = m.group(3) | 
|  |  | 
|  | if from_char <= previous_from: | 
|  | raise Exception("Duplicate or unsorted characters in input") | 
|  | yield from_char, to_char, from_name | 
|  | previous_from = from_char | 
|  |  | 
|  | # Computes the shift (to_char - from_char) in a mapping. | 
|  | def shift(mapping): | 
|  | return mapping[1] - mapping[0] | 
|  |  | 
|  | # Computes the stride (from_char2 - from_char1) of two mappings. | 
|  | def stride2(mapping1, mapping2): | 
|  | return mapping2[0] - mapping1[0] | 
|  |  | 
|  | # Computes the stride of a list of mappings. The list should have at least two | 
|  | # mappings. All mappings in the list are assumed to have the same stride. | 
|  | def stride(block): | 
|  | return stride2(block[0], block[1]) | 
|  |  | 
|  |  | 
|  | # b is a list of mappings. All the mappings are assumed to have the same | 
|  | # shift and the stride between adjecant mappings (if any) is constant. | 
|  | def dump_block(b): | 
|  | global body | 
|  |  | 
|  | if len(b) == 1: | 
|  | # Special case for handling blocks of length 1. We don't even need to | 
|  | # emit the "if (C < X) return C" check below as all characters in this | 
|  | # range will be caught by the "C < X" check emitted by the first | 
|  | # non-trivial block. | 
|  | body  += "  // {2}\n  if (C == {0:#06x})\n    return {1:#06x};\n".format(*b[0]) | 
|  | return | 
|  |  | 
|  | first = b[0][0] | 
|  | last = first + stride(b) * (len(b)-1) | 
|  | modulo = first % stride(b) | 
|  |  | 
|  | # All characters before this block map to themselves. | 
|  | body += "  if (C < {0:#06x})\n    return C;\n".format(first) | 
|  | body += "  // {0} characters\n".format(len(b)) | 
|  |  | 
|  | # Generic pattern: check upper bound (lower bound is checked by the "if" | 
|  | # above) and modulo of C, return C+shift. | 
|  | pattern = "  if (C <= {0:#06x} && C % {1} == {2})\n    return C + {3};\n" | 
|  |  | 
|  | if stride(b) == 2 and shift(b[0]) == 1 and modulo == 0: | 
|  | # Special case: | 
|  | # We can elide the modulo-check because the expression "C|1" will map | 
|  | # the intervening characters to themselves. | 
|  | pattern = "  if (C <= {0:#06x})\n    return C | 1;\n" | 
|  | elif stride(b) == 1: | 
|  | # Another special case: X % 1 is always zero, so don't emit the | 
|  | # modulo-check. | 
|  | pattern = "  if (C <= {0:#06x})\n    return C + {3};\n" | 
|  |  | 
|  | body += pattern.format(last, stride(b), modulo, shift(b[0])) | 
|  |  | 
|  | current_block = [] | 
|  | f = urllib2.urlopen(sys.argv[1]) | 
|  | for m in mappings(f): | 
|  | if len(current_block) == 0: | 
|  | current_block.append(m) | 
|  | continue | 
|  |  | 
|  | if shift(current_block[0]) != shift(m): | 
|  | # Incompatible shift, start a new block. | 
|  | dump_block(current_block) | 
|  | current_block = [m] | 
|  | continue | 
|  |  | 
|  | if len(current_block) == 1 or stride(current_block) == stride2(current_block[-1], m): | 
|  | current_block.append(m) | 
|  | continue | 
|  |  | 
|  | # Incompatible stride, start a new block. | 
|  | dump_block(current_block) | 
|  | current_block = [m] | 
|  | f.close() | 
|  |  | 
|  | dump_block(current_block) | 
|  |  | 
|  | print '//===---------- Support/UnicodeCaseFold.cpp -------------------------------===//' | 
|  | print '//' | 
|  | print '// This file was generated by utils/unicode-case-fold.py from the Unicode' | 
|  | print '// case folding database at' | 
|  | print '//   ', sys.argv[1] | 
|  | print '//' | 
|  | print '// To regenerate this file, run:' | 
|  | print '//   utils/unicode-case-fold.py \\' | 
|  | print '//     "{}" \\'.format(sys.argv[1]) | 
|  | print '//     > lib/Support/UnicodeCaseFold.cpp' | 
|  | print '//' | 
|  | print '//===----------------------------------------------------------------------===//' | 
|  | print '' | 
|  | print '#include "llvm/Support/Unicode.h"' | 
|  | print '' | 
|  | print "int llvm::sys::unicode::foldCharSimple(int C) {" | 
|  | print body | 
|  | print "  return C;" | 
|  | print "}" |