| """ Unicode Mapping Parser and Codec Generator. | 
 |  | 
 | This script parses Unicode mapping files as available from the Unicode | 
 | site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec | 
 | modules from them. The codecs use the standard character mapping codec | 
 | to actually apply the mapping. | 
 |  | 
 | Synopsis: gencodec.py dir codec_prefix | 
 |  | 
 | All files in dir are scanned and those producing non-empty mappings | 
 | will be written to <codec_prefix><mapname>.py with <mapname> being the | 
 | first part of the map's filename ('a' in a.b.c.txt) converted to | 
 | lowercase with hyphens replaced by underscores. | 
 |  | 
 | The tool also writes marshalled versions of the mapping tables to the | 
 | same location (with .mapping extension). | 
 |  | 
 | Written by Marc-Andre Lemburg (mal@lemburg.com). | 
 |  | 
 | (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. | 
 | (c) Copyright Guido van Rossum, 2000. | 
 |  | 
 | Table generation: | 
 | (c) Copyright Marc-Andre Lemburg, 2005. | 
 |     Licensed to PSF under a Contributor Agreement. | 
 |  | 
 | """#" | 
 |  | 
 | import re, os, marshal, codecs | 
 |  | 
# Maximum allowed size of charmap tables: python_tabledef_code() gives up
# and returns None when the largest mapped key exceeds this value.
MAX_TABLE_SIZE = 8192

# Standard undefined Unicode code point; python_tabledef_code() emits it
# as the placeholder character for entries that have no mapping.
UNI_UNDEFINED = unichr(0xFFFE)
 |  | 
 | mapRE = re.compile('((?:0x[0-9a-fA-F]+\+?)+)' | 
 |                    '\s+' | 
 |                    '((?:(?:0x[0-9a-fA-Z]+|<[A-Za-z]+>)\+?)*)' | 
 |                    '\s*' | 
 |                    '(#.+)?') | 
 |  | 
 | def parsecodes(codes, len=len, range=range): | 
 |  | 
 |     """ Converts code combinations to either a single code integer | 
 |         or a tuple of integers. | 
 |  | 
 |         meta-codes (in angular brackets, e.g. <LR> and <RL>) are | 
 |         ignored. | 
 |  | 
 |         Empty codes or illegal ones are returned as None. | 
 |  | 
 |     """ | 
 |     if not codes: | 
 |         return None | 
 |     l = codes.split('+') | 
 |     if len(l) == 1: | 
 |         return int(l[0],16) | 
 |     for i in range(len(l)): | 
 |         try: | 
 |             l[i] = int(l[i],16) | 
 |         except ValueError: | 
 |             l[i] = None | 
 |     l = [x for x in l if x is not None] | 
 |     if len(l) == 1: | 
 |         return l[0] | 
 |     else: | 
 |         return tuple(l) | 
 |  | 
 | def readmap(filename): | 
 |  | 
 |     f = open(filename,'r') | 
 |     lines = f.readlines() | 
 |     f.close() | 
 |     enc2uni = {} | 
 |     identity = [] | 
 |     unmapped = range(256) | 
 |  | 
 |     # UTC mapping tables per convention don't include the identity | 
 |     # mappings for code points 0x00 - 0x1F and 0x7F, unless these are | 
 |     # explicitly mapped to different characters or undefined | 
 |     for i in range(32) + [127]: | 
 |         identity.append(i) | 
 |         unmapped.remove(i) | 
 |         enc2uni[i] = (i, 'CONTROL CHARACTER') | 
 |  | 
 |     for line in lines: | 
 |         line = line.strip() | 
 |         if not line or line[0] == '#': | 
 |             continue | 
 |         m = mapRE.match(line) | 
 |         if not m: | 
 |             #print '* not matched: %s' % repr(line) | 
 |             continue | 
 |         enc,uni,comment = m.groups() | 
 |         enc = parsecodes(enc) | 
 |         uni = parsecodes(uni) | 
 |         if comment is None: | 
 |             comment = '' | 
 |         else: | 
 |             comment = comment[1:].strip() | 
 |         if enc < 256: | 
 |             if enc in unmapped: | 
 |                 unmapped.remove(enc) | 
 |             if enc == uni: | 
 |                 identity.append(enc) | 
 |             enc2uni[enc] = (uni,comment) | 
 |         else: | 
 |             enc2uni[enc] = (uni,comment) | 
 |  | 
 |     # If there are more identity-mapped entries than unmapped entries, | 
 |     # it pays to generate an identity dictionary first, and add explicit | 
 |     # mappings to None for the rest | 
 |     if len(identity) >= len(unmapped): | 
 |         for enc in unmapped: | 
 |             enc2uni[enc] = (None, "") | 
 |         enc2uni['IDENTITY'] = 256 | 
 |  | 
 |     return enc2uni | 
 |  | 
 | def hexrepr(t, precision=4): | 
 |  | 
 |     if t is None: | 
 |         return 'None' | 
 |     try: | 
 |         len(t) | 
 |     except: | 
 |         return '0x%0*X' % (precision, t) | 
 |     try: | 
 |         return '(' + ', '.join(['0x%0*X' % (precision, item) | 
 |                                 for item in t]) + ')' | 
 |     except TypeError, why: | 
 |         print '* failed to convert %r: %s' % (t, why) | 
 |         raise | 
 |  | 
 | def python_mapdef_code(varname, map, comments=1, precisions=(2, 4)): | 
 |  | 
 |     l = [] | 
 |     append = l.append | 
 |     if "IDENTITY" in map: | 
 |         append("%s = codecs.make_identity_dict(range(%d))" % | 
 |                (varname, map["IDENTITY"])) | 
 |         append("%s.update({" % varname) | 
 |         splits = 1 | 
 |         del map["IDENTITY"] | 
 |         identity = 1 | 
 |     else: | 
 |         append("%s = {" % varname) | 
 |         splits = 0 | 
 |         identity = 0 | 
 |  | 
 |     mappings = sorted(map.items()) | 
 |     i = 0 | 
 |     key_precision, value_precision = precisions | 
 |     for mapkey, mapvalue in mappings: | 
 |         mapcomment = '' | 
 |         if isinstance(mapkey, tuple): | 
 |             (mapkey, mapcomment) = mapkey | 
 |         if isinstance(mapvalue, tuple): | 
 |             (mapvalue, mapcomment) = mapvalue | 
 |         if mapkey is None: | 
 |             continue | 
 |         if (identity and | 
 |             mapkey == mapvalue and | 
 |             mapkey < 256): | 
 |             # No need to include identity mappings, since these | 
 |             # are already set for the first 256 code points. | 
 |             continue | 
 |         key = hexrepr(mapkey, key_precision) | 
 |         value = hexrepr(mapvalue, value_precision) | 
 |         if mapcomment and comments: | 
 |             append('    %s: %s,\t#  %s' % (key, value, mapcomment)) | 
 |         else: | 
 |             append('    %s: %s,' % (key, value)) | 
 |         i += 1 | 
 |         if i == 4096: | 
 |             # Split the definition into parts to that the Python | 
 |             # parser doesn't dump core | 
 |             if splits == 0: | 
 |                 append('}') | 
 |             else: | 
 |                 append('})') | 
 |             append('%s.update({' % varname) | 
 |             i = 0 | 
 |             splits = splits + 1 | 
 |     if splits == 0: | 
 |         append('}') | 
 |     else: | 
 |         append('})') | 
 |  | 
 |     return l | 
 |  | 
 | def python_tabledef_code(varname, map, comments=1, key_precision=2): | 
 |  | 
 |     l = [] | 
 |     append = l.append | 
 |     append('%s = (' % varname) | 
 |  | 
 |     # Analyze map and create table dict | 
 |     mappings = sorted(map.items()) | 
 |     table = {} | 
 |     maxkey = 0 | 
 |     if 'IDENTITY' in map: | 
 |         for key in range(256): | 
 |             table[key] = (key, '') | 
 |         maxkey = 255 | 
 |         del map['IDENTITY'] | 
 |     for mapkey, mapvalue in mappings: | 
 |         mapcomment = '' | 
 |         if isinstance(mapkey, tuple): | 
 |             (mapkey, mapcomment) = mapkey | 
 |         if isinstance(mapvalue, tuple): | 
 |             (mapvalue, mapcomment) = mapvalue | 
 |         if mapkey is None: | 
 |             continue | 
 |         table[mapkey] = (mapvalue, mapcomment) | 
 |         if mapkey > maxkey: | 
 |             maxkey = mapkey | 
 |     if maxkey > MAX_TABLE_SIZE: | 
 |         # Table too large | 
 |         return None | 
 |  | 
 |     # Create table code | 
 |     for key in range(maxkey + 1): | 
 |         if key not in table: | 
 |             mapvalue = None | 
 |             mapcomment = 'UNDEFINED' | 
 |         else: | 
 |             mapvalue, mapcomment = table[key] | 
 |         if mapvalue is None: | 
 |             mapchar = UNI_UNDEFINED | 
 |         else: | 
 |             if isinstance(mapvalue, tuple): | 
 |                 # 1-n mappings not supported | 
 |                 return None | 
 |             else: | 
 |                 mapchar = unichr(mapvalue) | 
 |         if mapcomment and comments: | 
 |             append('    %r\t#  %s -> %s' % (mapchar, | 
 |                                             hexrepr(key, key_precision), | 
 |                                             mapcomment)) | 
 |         else: | 
 |             append('    %r' % mapchar) | 
 |  | 
 |     append(')') | 
 |     return l | 
 |  | 
 | def codegen(name, map, encodingname, comments=1): | 
 |  | 
 |     """ Returns Python source for the given map. | 
 |  | 
 |         Comments are included in the source, if comments is true (default). | 
 |  | 
 |     """ | 
 |     # Generate code | 
 |     decoding_map_code = python_mapdef_code( | 
 |         'decoding_map', | 
 |         map, | 
 |         comments=comments) | 
 |     decoding_table_code = python_tabledef_code( | 
 |         'decoding_table', | 
 |         map, | 
 |         comments=comments) | 
 |     encoding_map_code = python_mapdef_code( | 
 |         'encoding_map', | 
 |         codecs.make_encoding_map(map), | 
 |         comments=comments, | 
 |         precisions=(4, 2)) | 
 |  | 
 |     if decoding_table_code: | 
 |         suffix = 'table' | 
 |     else: | 
 |         suffix = 'map' | 
 |  | 
 |     l = [ | 
 |         '''\ | 
 | """ Python Character Mapping Codec %s generated from '%s' with gencodec.py. | 
 |  | 
 | """#" | 
 |  | 
 | import codecs | 
 |  | 
 | ### Codec APIs | 
 |  | 
 | class Codec(codecs.Codec): | 
 |  | 
 |     def encode(self,input,errors='strict'): | 
 |         return codecs.charmap_encode(input,errors,encoding_%s) | 
 |  | 
 |     def decode(self,input,errors='strict'): | 
 |         return codecs.charmap_decode(input,errors,decoding_%s) | 
 | ''' % (encodingname, name, suffix, suffix)] | 
 |     l.append('''\ | 
 | class IncrementalEncoder(codecs.IncrementalEncoder): | 
 |     def encode(self, input, final=False): | 
 |         return codecs.charmap_encode(input,self.errors,encoding_%s)[0] | 
 |  | 
 | class IncrementalDecoder(codecs.IncrementalDecoder): | 
 |     def decode(self, input, final=False): | 
 |         return codecs.charmap_decode(input,self.errors,decoding_%s)[0]''' % | 
 |         (suffix, suffix)) | 
 |  | 
 |     l.append(''' | 
 | class StreamWriter(Codec,codecs.StreamWriter): | 
 |     pass | 
 |  | 
 | class StreamReader(Codec,codecs.StreamReader): | 
 |     pass | 
 |  | 
 | ### encodings module API | 
 |  | 
 | def getregentry(): | 
 |     return codecs.CodecInfo( | 
 |         name=%r, | 
 |         encode=Codec().encode, | 
 |         decode=Codec().decode, | 
 |         incrementalencoder=IncrementalEncoder, | 
 |         incrementaldecoder=IncrementalDecoder, | 
 |         streamreader=StreamReader, | 
 |         streamwriter=StreamWriter, | 
 |     ) | 
 | ''' % encodingname.replace('_', '-')) | 
 |  | 
 |     # Add decoding table or map (with preference to the table) | 
 |     if not decoding_table_code: | 
 |         l.append(''' | 
 | ### Decoding Map | 
 | ''') | 
 |         l.extend(decoding_map_code) | 
 |     else: | 
 |         l.append(''' | 
 | ### Decoding Table | 
 | ''') | 
 |         l.extend(decoding_table_code) | 
 |  | 
 |     # Add encoding map | 
 |     if decoding_table_code: | 
 |         l.append(''' | 
 | ### Encoding table | 
 | encoding_table=codecs.charmap_build(decoding_table) | 
 | ''') | 
 |     else: | 
 |         l.append(''' | 
 | ### Encoding Map | 
 | ''') | 
 |         l.extend(encoding_map_code) | 
 |  | 
 |     # Final new-line | 
 |     l.append('') | 
 |  | 
 |     return '\n'.join(l).expandtabs() | 
 |  | 
 | def pymap(name,map,pyfile,encodingname,comments=1): | 
 |  | 
 |     code = codegen(name,map,encodingname,comments) | 
 |     f = open(pyfile,'w') | 
 |     f.write(code) | 
 |     f.close() | 
 |  | 
 | def marshalmap(name,map,marshalfile): | 
 |  | 
 |     d = {} | 
 |     for e,(u,c) in map.items(): | 
 |         d[e] = (u,c) | 
 |     f = open(marshalfile,'wb') | 
 |     marshal.dump(d,f) | 
 |     f.close() | 
 |  | 
 | def convertdir(dir, dirprefix='', nameprefix='', comments=1): | 
 |  | 
 |     mapnames = os.listdir(dir) | 
 |     for mapname in mapnames: | 
 |         mappathname = os.path.join(dir, mapname) | 
 |         if not os.path.isfile(mappathname): | 
 |             continue | 
 |         name = os.path.split(mapname)[1] | 
 |         name = name.replace('-','_') | 
 |         name = name.split('.')[0] | 
 |         name = name.lower() | 
 |         name = nameprefix + name | 
 |         codefile = name + '.py' | 
 |         marshalfile = name + '.mapping' | 
 |         print 'converting %s to %s and %s' % (mapname, | 
 |                                               dirprefix + codefile, | 
 |                                               dirprefix + marshalfile) | 
 |         try: | 
 |             map = readmap(os.path.join(dir,mapname)) | 
 |             if not map: | 
 |                 print '* map is empty; skipping' | 
 |             else: | 
 |                 pymap(mappathname, map, dirprefix + codefile,name,comments) | 
 |                 marshalmap(mappathname, map, dirprefix + marshalfile) | 
 |         except ValueError, why: | 
 |             print '* conversion failed: %s' % why | 
 |             raise | 
 |  | 
 | def rewritepythondir(dir, dirprefix='', comments=1): | 
 |  | 
 |     mapnames = os.listdir(dir) | 
 |     for mapname in mapnames: | 
 |         if not mapname.endswith('.mapping'): | 
 |             continue | 
 |         name = mapname[:-len('.mapping')] | 
 |         codefile = name + '.py' | 
 |         print 'converting %s to %s' % (mapname, | 
 |                                        dirprefix + codefile) | 
 |         try: | 
 |             map = marshal.load(open(os.path.join(dir,mapname), | 
 |                                'rb')) | 
 |             if not map: | 
 |                 print '* map is empty; skipping' | 
 |             else: | 
 |                 pymap(mapname, map, dirprefix + codefile,name,comments) | 
 |         except ValueError, why: | 
 |             print '* conversion failed: %s' % why | 
 |  | 
if __name__ == '__main__':

    import sys
    # Hand-edited switch: as written, the script always converts the
    # mapping files via convertdir(); change the condition to regenerate
    # the codec modules from existing .mapping files instead.
    if 1:
        # The command line arguments are passed on positionally
        convertdir(*sys.argv[1:])
    else:
        rewritepythondir(*sys.argv[1:])