Moved gencodec.py from Tools/scripts/ to the Tools/unicode/ directory.
Added support for generating decoding tables in addition to decoding maps.
Cleaned up the implementation a bit.
diff --git a/Tools/scripts/gencodec.py b/Tools/scripts/gencodec.py
deleted file mode 100644
index 75337d6..0000000
--- a/Tools/scripts/gencodec.py
+++ /dev/null
@@ -1,300 +0,0 @@
-""" Unicode Mapping Parser and Codec Generator.
-
-This script parses Unicode mapping files as available from the Unicode
-site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec
-modules from them. The codecs use the standard character mapping codec
-to actually apply the mapping.
-
-Synopsis: gencodec.py dir codec_prefix
-
-All files in dir are scanned and those producing non-empty mappings
-will be written to <codec_prefix><mapname>.py with <mapname> being the
-first part of the map's filename ('a' in a.b.c.txt) converted to
-lowercase with hyphens replaced by underscores.
-
-The tool also writes marshalled versions of the mapping tables to the
-same location (with .mapping extension).
-
-Written by Marc-Andre Lemburg (mal@lemburg.com).
-
-(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
-(c) Copyright Guido van Rossum, 2000.
-
-"""#"
-
-import re,os,time,marshal
-
-# Create numeric tables or character based ones ?
-numeric = 1
-
-mapRE = re.compile('((?:0x[0-9a-fA-F]+\+?)+)'
- '\s+'
- '((?:(?:0x[0-9a-fA-Z]+|<[A-Za-z]+>)\+?)*)'
- '\s*'
- '(#.+)?')
-
-def parsecodes(codes,
- len=len, filter=filter,range=range):
-
- """ Converts code combinations to either a single code integer
- or a tuple of integers.
-
- meta-codes (in angular brackets, e.g. <LR> and <RL>) are
- ignored.
-
- Empty codes or illegal ones are returned as None.
-
- """
- if not codes:
- return None
- l = codes.split('+')
- if len(l) == 1:
- return int(l[0],16)
- for i in range(len(l)):
- try:
- l[i] = int(l[i],16)
- except ValueError:
- l[i] = None
- l = filter(lambda x: x is not None, l)
- if len(l) == 1:
- return l[0]
- else:
- return tuple(l)
-
-def readmap(filename):
-
- f = open(filename,'r')
- lines = f.readlines()
- f.close()
- enc2uni = {}
- identity = []
- unmapped = range(256)
- for i in range(256):
- unmapped[i] = i
- for line in lines:
- line = line.strip()
- if not line or line[0] == '#':
- continue
- m = mapRE.match(line)
- if not m:
- #print '* not matched: %s' % repr(line)
- continue
- enc,uni,comment = m.groups()
- enc = parsecodes(enc)
- uni = parsecodes(uni)
- if not comment:
- comment = ''
- else:
- comment = comment[1:]
- if enc < 256:
- unmapped.remove(enc)
- if enc == uni:
- identity.append(enc)
- else:
- enc2uni[enc] = (uni,comment)
- else:
- enc2uni[enc] = (uni,comment)
- # If there are more identity-mapped entries than unmapped entries,
- # it pays to generate an identity dictionary first, and add explicit
- # mappings to None for the rest
- if len(identity)>=len(unmapped):
- for enc in unmapped:
- enc2uni[enc] = (None, "")
- enc2uni['IDENTITY'] = 256
-
- return enc2uni
-
-def hexrepr(t):
-
- if t is None:
- return 'None'
- try:
- len(t)
- except:
- return '0x%04x' % t
- return '(' + ', '.join(map(lambda t: '0x%04x' % t, t)) + ')'
-
-def unicoderepr(t):
-
- if t is None:
- return 'None'
- if numeric:
- return hexrepr(t)
- else:
- try:
- len(t)
- except:
- return repr(unichr(t))
- return repr(''.join(map(unichr, t)))
-
-def keyrepr(t):
-
- if t is None:
- return 'None'
- if numeric:
- return hexrepr(t)
- else:
- try:
- len(t)
- except:
- if t < 256:
- return repr(chr(t))
- else:
- return repr(unichr(t))
- return repr(''.join(map(chr, t)))
-
-def codegen(name,map,comments=1):
-
- """ Returns Python source for the given map.
-
- Comments are included in the source, if comments is true (default).
-
- """
- l = [
- '''\
-""" Python Character Mapping Codec generated from '%s' with gencodec.py.
-
-"""#"
-
-import codecs
-
-### Codec APIs
-
-class Codec(codecs.Codec):
-
- def encode(self,input,errors='strict'):
-
- return codecs.charmap_encode(input,errors,encoding_map)
-
- def decode(self,input,errors='strict'):
-
- return codecs.charmap_decode(input,errors,decoding_map)
-
-class StreamWriter(Codec,codecs.StreamWriter):
- pass
-
-class StreamReader(Codec,codecs.StreamReader):
- pass
-
-### encodings module API
-
-def getregentry():
-
- return (Codec().encode,Codec().decode,StreamReader,StreamWriter)
-
-### Decoding Map
-''' % name,
- ]
-
- if map.has_key("IDENTITY"):
- l.append("decoding_map = codecs.make_identity_dict(range(%d))"
- % map["IDENTITY"])
- l.append("decoding_map.update({")
- splits = 1
- del map["IDENTITY"]
- else:
- l.append("decoding_map = {")
- splits = 0
-
- mappings = map.items()
- mappings.sort()
- append = l.append
- i = 0
- for e,value in mappings:
- try:
- (u,c) = value
- except TypeError:
- u = value
- c = ''
- key = keyrepr(e)
- if c and comments:
- append('\t%s: %s,\t# %s' % (key,unicoderepr(u),c))
- else:
- append('\t%s: %s,' % (key,unicoderepr(u)))
- i += 1
- if i == 4096:
- # Split the definition into parts to that the Python
- # parser doesn't dump core
- if splits == 0:
- append('}')
- else:
- append('})')
- append('decoding_map.update({')
- i = 0
- splits = splits + 1
- if splits == 0:
- append('}')
- else:
- append('})')
- append('''
-### Encoding Map
-
-encoding_map = codecs.make_encoding_map(decoding_map)
-''')
- return '\n'.join(l)
-
-def pymap(name,map,pyfile,comments=1):
-
- code = codegen(name,map,comments)
- f = open(pyfile,'w')
- f.write(code)
- f.close()
-
-def marshalmap(name,map,marshalfile):
-
- d = {}
- for e,(u,c) in map.items():
- d[e] = (u,c)
- f = open(marshalfile,'wb')
- marshal.dump(d,f)
- f.close()
-
-def convertdir(dir,prefix='',comments=1):
-
- mapnames = os.listdir(dir)
- for mapname in mapnames:
- name = os.path.split(mapname)[1]
- name = name.replace('-','_')
- name = name.split('.')[0]
- name = name.lower()
- codefile = name + '.py'
- marshalfile = name + '.mapping'
- print 'converting %s to %s and %s' % (mapname,
- prefix + codefile,
- prefix + marshalfile)
- try:
- map = readmap(os.path.join(dir,mapname))
- if not map:
- print '* map is empty; skipping'
- else:
- pymap(mapname, map, prefix + codefile,comments)
- marshalmap(mapname, map, prefix + marshalfile)
- except ValueError:
- print '* conversion failed'
-
-def rewritepythondir(dir,prefix='',comments=1):
-
- mapnames = os.listdir(dir)
- for mapname in mapnames:
- if not mapname.endswith('.mapping'):
- continue
- codefile = mapname[:-len('.mapping')] + '.py'
- print 'converting %s to %s' % (mapname,
- prefix + codefile)
- try:
- map = marshal.load(open(os.path.join(dir,mapname),
- 'rb'))
- if not map:
- print '* map is empty; skipping'
- else:
- pymap(mapname, map, prefix + codefile,comments)
- except ValueError, why:
- print '* conversion failed: %s' % why
-
-if __name__ == '__main__':
-
- import sys
- if 1:
- apply(convertdir,tuple(sys.argv[1:]))
- else:
- apply(rewritepythondir,tuple(sys.argv[1:]))
diff --git a/Tools/unicode/gencodec.py b/Tools/unicode/gencodec.py
new file mode 100644
index 0000000..7bce3d5
--- /dev/null
+++ b/Tools/unicode/gencodec.py
@@ -0,0 +1,391 @@
+""" Unicode Mapping Parser and Codec Generator.
+
+This script parses Unicode mapping files as available from the Unicode
+site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec
+modules from them. The codecs use the standard character mapping codec
+to actually apply the mapping.
+
+Synopsis: gencodec.py dir codec_prefix
+
+All files in dir are scanned and those producing non-empty mappings
+will be written to <codec_prefix><mapname>.py with <mapname> being the
+first part of the map's filename ('a' in a.b.c.txt) converted to
+lowercase with hyphens replaced by underscores.
+
+The tool also writes marshalled versions of the mapping tables to the
+same location (with .mapping extension).
+
+Written by Marc-Andre Lemburg (mal@lemburg.com). Modified to generate
+Unicode table maps for decoding.
+
+(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
+(c) Copyright Guido van Rossum, 2000.
+(c) Copyright Marc-Andre Lemburg, 2005.
+
+"""#"
+
+import re, os, time, marshal, codecs
+
+# Maximum allowed size of charmap tables
+MAX_TABLE_SIZE = 8192
+
+# Standard undefined Unicode code point
+UNI_UNDEFINED = unichr(0xFFFE)
+
+mapRE = re.compile('((?:0x[0-9a-fA-F]+\+?)+)'
+ '\s+'
+ '((?:(?:0x[0-9a-fA-Z]+|<[A-Za-z]+>)\+?)*)'
+ '\s*'
+ '(#.+)?')
+
+def parsecodes(codes,
+ len=len, filter=filter,range=range):
+
+ """ Converts code combinations to either a single code integer
+ or a tuple of integers.
+
+ Meta-codes (in angle brackets, e.g. <LR> and <RL>) and other
+ non-hex parts of a '+'-separated combination are dropped.
+
+ An empty string returns None; a lone invalid code raises ValueError.
+
+ """
+ if not codes:
+ return None
+ l = codes.split('+')
+ if len(l) == 1:
+ return int(l[0],16)
+ for i in range(len(l)):
+ try:
+ l[i] = int(l[i],16)
+ except ValueError:
+ l[i] = None
+ l = filter(lambda x: x is not None, l)
+ if len(l) == 1:
+ return l[0]
+ else:
+ return tuple(l)
+
+def readmap(filename):
+
+ f = open(filename,'r')
+ lines = f.readlines()
+ f.close()
+ enc2uni = {}
+ identity = []
+ unmapped = range(256)
+
+ # UTC mapping tables per convention don't include the identity
+ # mappings for code points 0x00 - 0x1F and 0x7F, unless these are
+ # explicitly mapped to different characters or undefined
+ for i in range(32) + [127]:
+ identity.append(i)
+ unmapped.remove(i)
+ enc2uni[i] = (i, 'CONTROL CHARACTER')
+
+ for line in lines:
+ line = line.strip()
+ if not line or line[0] == '#':
+ continue
+ m = mapRE.match(line)
+ if not m:
+ #print '* not matched: %s' % repr(line)
+ continue
+ enc,uni,comment = m.groups()
+ enc = parsecodes(enc)
+ uni = parsecodes(uni)
+ if comment is None:
+ comment = ''
+ else:
+ comment = comment[1:].strip()
+ if enc < 256:
+ if enc in unmapped:
+ unmapped.remove(enc)
+ if enc == uni:
+ identity.append(enc)
+ enc2uni[enc] = (uni,comment)
+ else:
+ enc2uni[enc] = (uni,comment)
+
+ # If there are at least as many identity-mapped entries as unmapped
+ # entries, it pays to generate an identity dictionary first and add
+ # explicit mappings to None for the rest
+ if len(identity) >= len(unmapped):
+ for enc in unmapped:
+ enc2uni[enc] = (None, "")
+ enc2uni['IDENTITY'] = 256
+
+ return enc2uni
+
+def hexrepr(t):
+
+ if t is None:
+ return 'None'
+ try:
+ len(t)
+ except:
+ return '0x%04x' % t
+ try:
+ return '(' + ', '.join(map(lambda t: '0x%04x' % t, t)) + ')'
+ except TypeError, why:
+ print '* failed to convert %r: %s' % (t, why)
+ raise
+
+def python_mapdef_code(varname, map, comments=1):
+
+ l = []
+ append = l.append
+ if map.has_key("IDENTITY"):
+ append("%s = codecs.make_identity_dict(range(%d))" %
+ (varname, map["IDENTITY"]))
+ append("%s.update({" % varname)
+ splits = 1
+ del map["IDENTITY"]
+ identity = 1
+ else:
+ append("%s = {" % varname)
+ splits = 0
+ identity = 0
+
+ mappings = map.items()
+ mappings.sort()
+ i = 0
+ for mapkey, mapvalue in mappings:
+ mapcomment = ''
+ if isinstance(mapkey, tuple):
+ (mapkey, mapcomment) = mapkey
+ if isinstance(mapvalue, tuple):
+ (mapvalue, mapcomment) = mapvalue
+ if mapkey is None:
+ continue
+ if (identity and
+ mapkey == mapvalue and
+ mapkey < 256):
+ # No need to include identity mappings, since these
+ # are already set for the first 256 code points.
+ continue
+ key = hexrepr(mapkey)
+ value = hexrepr(mapvalue)
+ if mapcomment and comments:
+ append(' %s: %s,\t# %s' % (key, value, mapcomment))
+ else:
+ append(' %s: %s,' % (key, value))
+ i += 1
+ if i == 4096:
+ # Split the definition into parts so that the Python
+ # parser doesn't dump core
+ if splits == 0:
+ append('}')
+ else:
+ append('})')
+ append('%s.update({' % varname)
+ i = 0
+ splits = splits + 1
+ if splits == 0:
+ append('}')
+ else:
+ append('})')
+
+ return l
+
+def python_tabledef_code(varname, map, comments=1):
+
+ l = []
+ append = l.append
+ append('%s = (' % varname)
+
+ # Analyze map and create table dict
+ mappings = map.items()
+ mappings.sort()
+ table = {}
+ maxkey = 0
+ if map.has_key('IDENTITY'):
+ for key in range(256):
+ table[key] = (key, '')
+ maxkey = 255
+ del map['IDENTITY']
+ for mapkey, mapvalue in mappings:
+ mapcomment = ''
+ if isinstance(mapkey, tuple):
+ (mapkey, mapcomment) = mapkey
+ if isinstance(mapvalue, tuple):
+ (mapvalue, mapcomment) = mapvalue
+ if mapkey is None:
+ continue
+ table[mapkey] = (mapvalue, mapcomment)
+ if mapkey > maxkey:
+ maxkey = mapkey
+ if maxkey > MAX_TABLE_SIZE:
+ # Table too large
+ return None
+
+ # Create table code
+ for key in range(maxkey + 1):
+ if key not in table:
+ mapvalue = None
+ mapcomment = 'UNDEFINED'
+ else:
+ mapvalue, mapcomment = table[key]
+ if mapvalue is None:
+ mapchar = UNI_UNDEFINED
+ else:
+ if isinstance(mapvalue, tuple):
+ # 1-n mappings not supported
+ return None
+ else:
+ mapchar = unichr(mapvalue)
+ if mapcomment and comments:
+ append(' %r\t# %s -> %s' % (mapchar,
+ hexrepr(key),
+ mapcomment))
+ else:
+ append(' %r' % mapchar)
+
+ append(')')
+ return l
+
+def codegen(name, map, comments=1):
+
+ """ Returns Python source for the given map.
+
+ Comments are included in the source, if comments is true (default).
+
+ """
+ # Generate code
+ decoding_map_code = python_mapdef_code(
+ 'decoding_map',
+ map,
+ comments=comments)
+ decoding_table_code = python_tabledef_code(
+ 'decoding_table',
+ map,
+ comments=comments)
+ encoding_map_code = python_mapdef_code(
+ 'encoding_map',
+ codecs.make_encoding_map(map),
+ comments=comments)
+
+ l = [
+ '''\
+""" Python Character Mapping Codec generated from '%s' with gencodec.py.
+
+"""#"
+
+import codecs
+
+### Codec APIs
+
+class Codec(codecs.Codec):
+
+ def encode(self,input,errors='strict'):
+
+ return codecs.charmap_encode(input,errors,encoding_map)
+
+ def decode(self,input,errors='strict'):
+''' % name
+ ]
+ if decoding_table_code:
+ l.append('''\
+ return codecs.charmap_decode(input,errors,decoding_table)''')
+ else:
+ l.append('''\
+ return codecs.charmap_decode(input,errors,decoding_map)''')
+
+ l.append('''
+class StreamWriter(Codec,codecs.StreamWriter):
+ pass
+
+class StreamReader(Codec,codecs.StreamReader):
+ pass
+
+### encodings module API
+
+def getregentry():
+
+ return (Codec().encode,Codec().decode,StreamReader,StreamWriter)
+
+### Decoding Map
+''')
+ l.extend(decoding_map_code)
+
+ # Add optional decoding table
+ if decoding_table_code:
+ l.append('''
+### Decoding Table
+''')
+ l.extend(decoding_table_code)
+
+ l.append('''
+### Encoding Map
+''')
+ l.extend(encoding_map_code)
+
+ return '\n'.join(l)
+
+def pymap(name,map,pyfile,comments=1):
+
+ code = codegen(name,map,comments)
+ f = open(pyfile,'w')
+ f.write(code)
+ f.close()
+
+def marshalmap(name,map,marshalfile):
+
+ d = {}
+ for e,(u,c) in map.items():
+ d[e] = (u,c)
+ f = open(marshalfile,'wb')
+ marshal.dump(d,f)
+ f.close()
+
+def convertdir(dir,prefix='',comments=1):
+
+ mapnames = os.listdir(dir)
+ for mapname in mapnames:
+ mappathname = os.path.join(dir, mapname)
+ name = os.path.split(mapname)[1]
+ name = name.replace('-','_')
+ name = name.split('.')[0]
+ name = name.lower()
+ codefile = name + '.py'
+ marshalfile = name + '.mapping'
+ print 'converting %s to %s and %s' % (mapname,
+ prefix + codefile,
+ prefix + marshalfile)
+ try:
+ map = readmap(os.path.join(dir,mapname))
+ if not map:
+ print '* map is empty; skipping'
+ else:
+ pymap(mappathname, map, prefix + codefile,comments)
+ marshalmap(mappathname, map, prefix + marshalfile)
+ except ValueError, why:
+ print '* conversion failed: %s' % why
+ raise
+
+def rewritepythondir(dir,prefix='',comments=1):
+
+ mapnames = os.listdir(dir)
+ for mapname in mapnames:
+ if not mapname.endswith('.mapping'):
+ continue
+ codefile = mapname[:-len('.mapping')] + '.py'
+ print 'converting %s to %s' % (mapname,
+ prefix + codefile)
+ try:
+ map = marshal.load(open(os.path.join(dir,mapname),
+ 'rb'))
+ if not map:
+ print '* map is empty; skipping'
+ else:
+ pymap(mapname, map, prefix + codefile,comments)
+ except ValueError, why:
+ print '* conversion failed: %s' % why
+
+if __name__ == '__main__':
+
+ import sys
+ if 1:
+ apply(convertdir,tuple(sys.argv[1:]))
+ else:
+ apply(rewritepythondir,tuple(sys.argv[1:]))