Marc-Andre Lemburg: added
gencodec.py - Create Python codecs from Unicode mapping files
diff --git a/Tools/scripts/README b/Tools/scripts/README
index db270b8..a86646b 100644
--- a/Tools/scripts/README
+++ b/Tools/scripts/README
@@ -18,6 +18,7 @@
 fixnotice.py		Fix the copyright notice in source files
 fixps.py		Fix Python scripts' first line (if #!)
 ftpmirror.py		FTP mirror script
+gencodec.py		Create Python codecs from Unicode mapping files
 h2py.py			Translate #define's into Python assignments
 ifdef.py		Remove #if(n)def groups from C sources
 lfcr.py			Change LF line endings to CRLF (Unix to Windows)
diff --git a/Tools/scripts/gencodec.py b/Tools/scripts/gencodec.py
new file mode 100644
index 0000000..b5680ee
--- /dev/null
+++ b/Tools/scripts/gencodec.py
@@ -0,0 +1,289 @@
+""" Unicode Mapping Parser and Codec Generator.
+
+This script parses Unicode mapping files as available from the Unicode
+site (ftp.unicode.org) and creates Python codec modules from them. The
+codecs use the standard character mapping codec to actually apply the
+mapping.
+
+Synopsis: gencodec.py dir codec_prefix
+
+All files in dir are scanned and those producing non-empty mappings
+will be written to <codec_prefix><mapname>.py with <mapname> being the
+first part of the map's filename ('a' in a.b.c.txt) converted to
+lowercase with hyphens replaced by underscores.
+
+The tool also writes marshalled versions of the mapping tables to the
+same location (with .mapping extension).
+
+Written by Marc-Andre Lemburg (mal@lemburg.com).
+
+(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
+
+"""#"
+
+import string,re,os,time,marshal
+
+# Create numeric tables or character based ones ?
+numeric = 1
+
+mapRE = re.compile(r'((?:0x[0-9a-fA-F]+\+?)+)'                 # 1: enc code(s)
+                   r'\s+'
+                   r'((?:(?:0x[0-9a-fA-F]+|<[A-Za-z]+>)\+?)*)' # 2: uni code(s)
+                   r'\s*'
+                   r'(#.+)?')                                  # 3: comment
+
+def parsecodes(codes,
+
+               split=string.split,atoi=string.atoi,len=len,
+               filter=filter,range=range):
+
+    """ Converts code combinations to either a single code integer
+        or a tuple of integers.
+
+        codes is a '+'-separated string of hex codes. In a combination,
+        parts atoi rejects (e.g. meta-codes such as <LR>, <RL>) are dropped.
+
+        Empty codes are returned as None; a single code that atoi
+        rejects raises ValueError instead.
+    """
+    if not codes:
+        return None
+    l = split(codes,'+')
+    if len(l) == 1:    # single code: converted outside the try below
+        return atoi(l[0],16)
+    for i in range(len(l)):
+        try:
+            l[i] = atoi(l[i],16)
+        except ValueError:
+            l[i] = None
+    l = filter(lambda x: x is not None, l)
+    if len(l) == 1:    # only one part survived the filtering
+        return l[0]
+    else:
+        return tuple(l)
+
+def readmap(filename,
+
+            strip=string.strip):
+    """ Reads the mapping file filename; returns {enc: (uni, comment)}. """
+    f = open(filename,'r')
+    lines = f.readlines()
+    f.close()
+    enc2uni = {}
+    for line in lines:
+        line = strip(line)
+        if not line or line[0] == '#':
+            continue
+        m = mapRE.match(line)
+        if not m:
+            # lines that do not match mapRE are skipped silently
+            continue
+        enc,uni,comment = m.groups()
+        enc = parsecodes(enc)
+        uni = parsecodes(uni)
+        if not comment:
+            comment = ''
+        else:
+            comment = comment[1:]    # drop the leading '#'
+        if enc != uni:    # identity mappings are left out of the result
+            enc2uni[enc] = (uni,comment)
+    return enc2uni
+
+def hexrepr(t,
+
+            join=string.join):
+    """ Formats a code, or a sequence of codes, as 0x%04x hex text. """
+    if t is None:
+        return 'None'
+    try:
+        len(t)
+    except:    # no len(): t is a single code, not a sequence
+        return '0x%04x' % t
+    return '(' + join(map(lambda t: '0x%04x' % t, t),', ') + ')'
+
+def unicoderepr(t,
+
+                join=string.join):
+    """ Returns source text for a map value; form depends on numeric. """
+    if t is None:
+        return 'None'
+    if numeric:
+        return hexrepr(t)
+    else:
+        try:
+            len(t)
+        except:    # no len(): t is a single code, not a sequence
+            return repr(unichr(t))
+        return repr(join(map(unichr, t),''))
+
+def keyrepr(t,
+
+            join=string.join):
+    """ Returns source text for a map key; form depends on numeric. """
+    if t is None:
+        return 'None'
+    if numeric:
+        return hexrepr(t)
+    else:
+        try:
+            len(t)
+        except:    # no len(): t is a single code, not a sequence
+            if t < 256:
+                return repr(chr(t))
+            else:
+                return repr(unichr(t))
+        return repr(join(map(chr, t),''))
+
+def codegen(name,map,comments=1):
+
+    """ Returns Python source for a charmap codec module for the given map.
+
+        name goes into the module docstring; map values are (unicode, comment)
+        tuples or bare codes. Comments are emitted only if comments is true.
+    """
+    l = [
+        '''\
+""" Python Character Mapping Codec generated from '%s'.
+
+Written by Marc-Andre Lemburg (mal@lemburg.com).
+
+(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
+
+"""#"
+
+import codecs
+
+### Codec APIs
+
+class Codec(codecs.Codec):
+
+    def encode(self,input,errors='strict'):
+
+        return codecs.charmap_encode(input,errors,encoding_map)
+        
+    def decode(self,input,errors='strict'):
+
+        return codecs.charmap_decode(input,errors,decoding_map)
+
+class StreamWriter(Codec,codecs.StreamWriter):
+    pass
+        
+class StreamReader(Codec,codecs.StreamReader):
+    pass
+
+### encodings module API
+
+def getregentry():
+
+    return (Codec().encode,Codec().decode,StreamReader,StreamWriter)
+
+### Decoding Map
+
+decoding_map = {
+''' % name,
+        ]
+    mappings = map.items()
+    mappings.sort()
+    append = l.append
+    i = 0
+    splits = 0
+    for e,value in mappings:
+        try:
+            (u,c) = value
+        except TypeError:
+            u = value
+            c = ''
+        key = keyrepr(e)
+        if c and comments:
+            append('\t%s: %s,\t# %s' % (key,unicoderepr(u),c))
+        else:
+            append('\t%s: %s,' % (key,unicoderepr(u)))
+        i = i + 1
+        if i == 4096:
+            # Split the definition into parts so that the Python
+            # parser doesn't dump core; later parts update decoding_map
+            if splits == 0:
+                append('}')
+            else:
+                append('})')
+            append('decoding_map.update({')
+            i = 0
+            splits = splits + 1
+    if splits == 0:
+        append('}')
+    else:
+        append('})')
+    append('''
+### Encoding Map
+
+encoding_map = {}
+for k,v in decoding_map.items():
+    encoding_map[v] = k
+''')
+    return string.join(l,'\n')
+
+def pymap(name,map,pyfile,comments=1):
+    """ Writes the source returned by codegen() for map to pyfile. """
+    code = codegen(name,map,comments)
+    f = open(pyfile,'w')
+    f.write(code)
+    f.close()
+
+def marshalmap(name,map,marshalfile):
+    """ Dumps a copy of map to marshalfile with marshal; name is unused. """
+    d = {}
+    for e,(u,c) in map.items():
+        d[e] = (u,c)
+    f = open(marshalfile,'wb')
+    marshal.dump(d,f)
+    f.close()
+
+def convertdir(dir,prefix='',comments=1):
+    """ Converts each file in dir to prefix+<name>.py and .mapping files. """
+    mapnames = os.listdir(dir)
+    for mapname in mapnames:
+        name = os.path.split(mapname)[1]
+        name = string.replace(name,'-','_')
+        name = string.split(name, '.')[0]
+        name = string.lower(name)
+        codefile = name + '.py'
+        marshalfile = name + '.mapping'
+        print 'converting %s to %s and %s' % (mapname,
+                                              prefix + codefile,
+                                              prefix + marshalfile)
+        try:
+            map = readmap(os.path.join(dir,mapname))
+            if not map:
+                print '* map is empty; skipping'
+            else:
+                pymap(mapname, map, prefix + codefile,comments)
+                marshalmap(mapname, map, prefix + marshalfile)
+        except ValueError:    # e.g. atoi rejecting a code in parsecodes
+            print '* conversion failed'
+
+def rewritepythondir(dir,prefix='',comments=1):
+    """ Regenerates prefix+<name>.py from each <name>.mapping file in dir. """
+    mapnames = os.listdir(dir)
+    for mapname in mapnames:
+        if mapname[-len('.mapping'):] != '.mapping':
+            continue
+        codefile = mapname[:-len('.mapping')] + '.py'
+        print 'converting %s to %s' % (mapname,
+                                       prefix + codefile)
+        try:
+            map = marshal.load(open(os.path.join(dir,mapname),
+                               'rb'))    # NOTE: file is never closed explicitly
+            if not map:
+                print '* map is empty; skipping'
+            else:
+                pymap(mapname, map, prefix + codefile,comments)
+        except ValueError, why:
+            print '* conversion failed: %s' % why
+
+if __name__ == '__main__':
+    # Script entry: command line arguments are passed on positionally
+    import sys
+    if 1:    # hard-wired switch: convertdir runs, the else branch never does
+        apply(convertdir,tuple(sys.argv[1:]))
+    else:
+        apply(rewritepythondir,tuple(sys.argv[1:]))