| """ Unicode Mapping Parser and Codec Generator. | 
 |  | 
 | This script parses Unicode mapping files as available from the Unicode | 
 | site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec | 
 | modules from them. The codecs use the standard character mapping codec | 
 | to actually apply the mapping. | 
 |  | 
 | Synopsis: gencodec.py dir codec_prefix | 
 |  | 
 | All files in dir are scanned and those producing non-empty mappings | 
 | will be written to <codec_prefix><mapname>.py with <mapname> being the | 
 | first part of the map's filename ('a' in a.b.c.txt) converted to | 
 | lowercase with hyphens replaced by underscores. | 
 |  | 
 | The tool also writes marshalled versions of the mapping tables to the | 
 | same location (with .mapping extension). | 
 |  | 
 | Written by Marc-Andre Lemburg (mal@lemburg.com). | 
 |  | 
 | (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. | 
 | (c) Copyright Guido van Rossum, 2000. | 
 |  | 
 | """#" | 
 |  | 
 | import string,re,os,time,marshal | 
 |  | 
 | # Create numeric tables or character based ones ? | 
 | numeric = 1 | 
 |  | 
 | mapRE = re.compile('((?:0x[0-9a-fA-F]+\+?)+)' | 
 |                    '\s+' | 
 |                    '((?:(?:0x[0-9a-fA-Z]+|<[A-Za-z]+>)\+?)*)' | 
 |                    '\s*' | 
 |                    '(#.+)?') | 
 |  | 
 | def parsecodes(codes, | 
 |  | 
 |                split=string.split,atoi=string.atoi,len=len, | 
 |                filter=filter,range=range): | 
 |  | 
 |     """ Converts code combinations to either a single code integer | 
 |         or a tuple of integers. | 
 |  | 
 |         meta-codes (in angular brackets, e.g. <LR> and <RL>) are | 
 |         ignored. | 
 |  | 
 |         Empty codes or illegal ones are returned as None. | 
 |  | 
 |     """ | 
 |     if not codes: | 
 |         return None | 
 |     l = split(codes,'+') | 
 |     if len(l) == 1: | 
 |         return atoi(l[0],16) | 
 |     for i in range(len(l)): | 
 |         try: | 
 |             l[i] = atoi(l[i],16) | 
 |         except ValueError: | 
 |             l[i] = None | 
 |     l = filter(lambda x: x is not None, l) | 
 |     if len(l) == 1: | 
 |         return l[0] | 
 |     else: | 
 |         return tuple(l) | 
 |  | 
 | def readmap(filename, | 
 |  | 
 |             strip=string.strip): | 
 |  | 
 |     f = open(filename,'r') | 
 |     lines = f.readlines() | 
 |     f.close() | 
 |     enc2uni = {} | 
 |     identity = [] | 
 |     unmapped = range(256) | 
 |     for i in range(256): | 
 |         unmapped[i] = i | 
 |     for line in lines: | 
 |         line = strip(line) | 
 |         if not line or line[0] == '#': | 
 |             continue | 
 |         m = mapRE.match(line) | 
 |         if not m: | 
 |             #print '* not matched: %s' % repr(line) | 
 |             continue | 
 |         enc,uni,comment = m.groups() | 
 |         enc = parsecodes(enc) | 
 |         uni = parsecodes(uni) | 
 |         if not comment: | 
 |             comment = '' | 
 |         else: | 
 |             comment = comment[1:] | 
 |         if enc < 256: | 
 |             unmapped.remove(enc) | 
 |             if enc == uni: | 
 |                 identity.append(enc) | 
 |             else: | 
 |                 enc2uni[enc] = (uni,comment) | 
 |         else: | 
 |             enc2uni[enc] = (uni,comment) | 
 |     # If there are more identity-mapped entries than unmapped entries, | 
 |     # it pays to generate an identity dictionary first, add add explicit | 
 |     # mappings to None for the rest | 
 |     if len(identity)>=len(unmapped): | 
 |         for enc in unmapped: | 
 |             enc2uni[enc] = (None, "") | 
 |         enc2uni['IDENTITY'] = 256 | 
 |  | 
 |     return enc2uni | 
 |  | 
 | def hexrepr(t, | 
 |  | 
 |             join=string.join): | 
 |  | 
 |     if t is None: | 
 |         return 'None' | 
 |     try: | 
 |         len(t) | 
 |     except: | 
 |         return '0x%04x' % t | 
 |     return '(' + join(map(lambda t: '0x%04x' % t, t),', ') + ')' | 
 |  | 
 | def unicoderepr(t, | 
 |  | 
 |                 join=string.join): | 
 |  | 
 |     if t is None: | 
 |         return 'None' | 
 |     if numeric: | 
 |         return hexrepr(t) | 
 |     else: | 
 |         try: | 
 |             len(t) | 
 |         except: | 
 |             return repr(unichr(t)) | 
 |         return repr(join(map(unichr, t),'')) | 
 |  | 
 | def keyrepr(t, | 
 |  | 
 |             join=string.join): | 
 |  | 
 |     if t is None: | 
 |         return 'None' | 
 |     if numeric: | 
 |         return hexrepr(t) | 
 |     else: | 
 |         try: | 
 |             len(t) | 
 |         except: | 
 |             if t < 256: | 
 |                 return repr(chr(t)) | 
 |             else: | 
 |                 return repr(unichr(t)) | 
 |         return repr(join(map(chr, t),'')) | 
 |  | 
 | def codegen(name,map,comments=1): | 
 |  | 
 |     """ Returns Python source for the given map. | 
 |  | 
 |         Comments are included in the source, if comments is true (default). | 
 |  | 
 |     """ | 
 |     l = [ | 
 |         '''\ | 
 | """ Python Character Mapping Codec generated from '%s' with gencodec.py. | 
 |  | 
 | Written by Marc-Andre Lemburg (mal@lemburg.com). | 
 |  | 
 | (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. | 
 | (c) Copyright 2000 Guido van Rossum. | 
 |  | 
 | """#" | 
 |  | 
 | import codecs | 
 |  | 
 | ### Codec APIs | 
 |  | 
 | class Codec(codecs.Codec): | 
 |  | 
 |     def encode(self,input,errors='strict'): | 
 |  | 
 |         return codecs.charmap_encode(input,errors,encoding_map) | 
 |  | 
 |     def decode(self,input,errors='strict'): | 
 |  | 
 |         return codecs.charmap_decode(input,errors,decoding_map) | 
 |  | 
 | class StreamWriter(Codec,codecs.StreamWriter): | 
 |     pass | 
 |  | 
 | class StreamReader(Codec,codecs.StreamReader): | 
 |     pass | 
 |  | 
 | ### encodings module API | 
 |  | 
 | def getregentry(): | 
 |  | 
 |     return (Codec().encode,Codec().decode,StreamReader,StreamWriter) | 
 |  | 
 | ### Decoding Map | 
 | ''' % name, | 
 |         ] | 
 |  | 
 |     if map.has_key("IDENTITY"): | 
 |         l.append("decoding_map = codecs.make_identity_dict(range(%d))" | 
 |                  % map["IDENTITY"]) | 
 |         l.append("decoding_map.update({") | 
 |         splits = 1 | 
 |         del map["IDENTITY"] | 
 |     else: | 
 |         l.append("decoding_map = {") | 
 |         splits = 0 | 
 |  | 
 |     mappings = map.items() | 
 |     mappings.sort() | 
 |     append = l.append | 
 |     i = 0 | 
 |     for e,value in mappings: | 
 |         try: | 
 |             (u,c) = value | 
 |         except TypeError: | 
 |             u = value | 
 |             c = '' | 
 |         key = keyrepr(e) | 
 |         if c and comments: | 
 |             append('\t%s: %s,\t# %s' % (key,unicoderepr(u),c)) | 
 |         else: | 
 |             append('\t%s: %s,' % (key,unicoderepr(u))) | 
 |         i += 1 | 
 |         if i == 4096: | 
 |             # Split the definition into parts to that the Python | 
 |             # parser doesn't dump core | 
 |             if splits == 0: | 
 |                 append('}') | 
 |             else: | 
 |                 append('})') | 
 |             append('decoding_map.update({') | 
 |             i = 0 | 
 |             splits = splits + 1 | 
 |     if splits == 0: | 
 |         append('}') | 
 |     else: | 
 |         append('})') | 
 |     append(''' | 
 | ### Encoding Map | 
 |  | 
 | encoding_map = {} | 
 | for k,v in decoding_map.items(): | 
 |     encoding_map[v] = k | 
 | ''') | 
 |     return string.join(l,'\n') | 
 |  | 
 | def pymap(name,map,pyfile,comments=1): | 
 |  | 
 |     code = codegen(name,map,comments) | 
 |     f = open(pyfile,'w') | 
 |     f.write(code) | 
 |     f.close() | 
 |  | 
 | def marshalmap(name,map,marshalfile): | 
 |  | 
 |     d = {} | 
 |     for e,(u,c) in map.items(): | 
 |         d[e] = (u,c) | 
 |     f = open(marshalfile,'wb') | 
 |     marshal.dump(d,f) | 
 |     f.close() | 
 |  | 
 | def convertdir(dir,prefix='',comments=1): | 
 |  | 
 |     mapnames = os.listdir(dir) | 
 |     for mapname in mapnames: | 
 |         name = os.path.split(mapname)[1] | 
 |         name = string.replace(name,'-','_') | 
 |         name = string.split(name, '.')[0] | 
 |         name = string.lower(name) | 
 |         codefile = name + '.py' | 
 |         marshalfile = name + '.mapping' | 
 |         print 'converting %s to %s and %s' % (mapname, | 
 |                                               prefix + codefile, | 
 |                                               prefix + marshalfile) | 
 |         try: | 
 |             map = readmap(os.path.join(dir,mapname)) | 
 |             if not map: | 
 |                 print '* map is empty; skipping' | 
 |             else: | 
 |                 pymap(mapname, map, prefix + codefile,comments) | 
 |                 marshalmap(mapname, map, prefix + marshalfile) | 
 |         except ValueError: | 
 |             print '* conversion failed' | 
 |  | 
 | def rewritepythondir(dir,prefix='',comments=1): | 
 |  | 
 |     mapnames = os.listdir(dir) | 
 |     for mapname in mapnames: | 
 |         if not mapname.endswith('.mapping'): | 
 |             continue | 
 |         codefile = mapname[:-len('.mapping')] + '.py' | 
 |         print 'converting %s to %s' % (mapname, | 
 |                                        prefix + codefile) | 
 |         try: | 
 |             map = marshal.load(open(os.path.join(dir,mapname), | 
 |                                'rb')) | 
 |             if not map: | 
 |                 print '* map is empty; skipping' | 
 |             else: | 
 |                 pymap(mapname, map, prefix + codefile,comments) | 
 |         except ValueError, why: | 
 |             print '* conversion failed: %s' % why | 
 |  | 
 | if __name__ == '__main__': | 
 |  | 
 |     import sys | 
 |     if 1: | 
 |         apply(convertdir,tuple(sys.argv[1:])) | 
 |     else: | 
 |         apply(rewritepythondir,tuple(sys.argv[1:])) |