#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl    created (based on bits and pieces from unidb)
# 2000-09-25 fl    merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl    added character type table
# 2000-09-26 fl    added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl    expand first/last ranges
# 2001-01-19 fl    added character name tables (2.1)
# 2001-01-21 fl    added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd    use string methods
# 2002-10-18 mvl   update to Unicode 3.2
# 2002-10-22 mvl   generate NFC tables
# 2002-11-24 mvl   expand all ranges, sort names version-independently
# 2002-11-25 mvl   add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl   update to Unicode 4.1; add UCD 3.2 delta
# 2008-06-11 gb    add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#

import sys, os, zipfile

SCRIPT = sys.argv[0]
VERSION = "3.2"

# The Unicode Database
UNIDATA_VERSION = "6.0.0"
# File-name templates; "%s" receives "" for the current version or
# "-<version>" for an older UCD snapshot (see maketables below).
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"

# Older UCD releases for which delta tables are generated.
old_versions = ["3.2.0"]

# General-category abbreviations; table index doubles as the numeric
# category code written into the generated C tables.
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

# Line_Break classes that force a mandatory break.
MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NODELTA_MASK = 0x800
NUMERIC_MASK = 0x1000
def maketables(trace=0):
    """Regenerate all generated headers from the current Unicode database.

    Reads the current UCD (UNIDATA_VERSION), merges in the deltas from
    each release listed in ``old_versions``, then writes the name, data
    and type header files.
    """

    print("--- Reading", UNICODE_DATA % "", "...")

    version = ""
    ucd = UnicodeData(UNIDATA_VERSION)
    # count assigned code points (non-None table entries)
    print(sum(map(bool, ucd.table)), "characters")

    # fold in the older UCD snapshots so version deltas can be emitted
    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-" + version), "...")
        older = UnicodeData(version)
        print(sum(map(bool, older.table)), "characters")
        merge_old_version(version, ucd, older)

    makeunicodename(ucd, trace)
    makeunicodedata(ucd, trace)
    makeunicodetype(ucd, trace)
92# --------------------------------------------------------------------
93# unicode character properties
94
def makeunicodedata(unicode, trace):
    """Write Modules/unicodedata_db.h.

    Emits: the deduplicated database-record table, the NFC first/last
    reindexing tables, the packed decomposition data, split (two-level)
    index tables for records and decompositions, and per-version delta
    tables for the releases in ``unicode.changed``.

    ``unicode`` is a UnicodeData instance; ``trace`` is passed through
    to splitbins()/Array.dump() for diagnostics.
    """

    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}          # record tuple -> index into `table`
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            normalizationquickcheck = record[17]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables (deduplicated via `cache`)
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]           # flat array: [prefix|len<<8, codepoints...]
    decomp_prefix = [""]        # compatibility tags, e.g. "<compat>"
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []             # (first, last, composed) NFC triples
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                # hard limit imposed by the C consumer (nfd_nfkd)
                if len(decomp) > 19:
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content: first word packs prefix index (low byte) and
                # decomposition length (high byte)
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs: canonical (no prefix) two-character
                # decompositions not excluded and whose first character
                # has combining class 0
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                try:
                    # NOTE(review): decomp_data holds ints while `decomp` is
                    # a list, so this lookup appears to always raise
                    # ValueError (no sharing) -- confirm before changing.
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    # Compress the sparse comp_first/comp_last marker arrays into dense
    # sequential indexes plus contiguous (start, end) ranges.
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    # Dense 2-D composition matrix, indexed [first*total_last + last].
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
    print("/* a list of unique database records */", file=fp)
    print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
    for item in table:
        print(" {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    print("/* Reindexing of NFC first characters. */", file=fp)
    print("#define TOTAL_FIRST",total_first, file=fp)
    print("#define TOTAL_LAST",total_last, file=fp)
    print("struct reindex{int start;short count,index;};", file=fp)
    print("static struct reindex nfc_first[] = {", file=fp)
    for start,end in comp_first_ranges:
        print(" { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)
    print(" {0,0,0}", file=fp)
    print("};\n", file=fp)
    print("static struct reindex nfc_last[] = {", file=fp)
    for start,end in comp_last_ranges:
        print(" { %d, %d, %d}," % (start,end-start,comp_last[start]), file=fp)
    print(" {0,0,0}", file=fp)
    print("};\n", file=fp)

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print("/* string literals */", file=fp)
    print("const char *_PyUnicode_CategoryNames[] = {", file=fp)
    for name in CATEGORY_NAMES:
        print(" \"%s\"," % name, file=fp)
    print(" NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_BidirectionalNames[] = {", file=fp)
    for name in BIDIRECTIONAL_NAMES:
        print(" \"%s\"," % name, file=fp)
    print(" NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_EastAsianWidthNames[] = {", file=fp)
    for name in EASTASIANWIDTH_NAMES:
        print(" \"%s\"," % name, file=fp)
    print(" NULL", file=fp)
    print("};", file=fp)

    print("static const char *decomp_prefix[] = {", file=fp)
    for name in decomp_prefix:
        print(" \"%s\"," % name, file=fp)
    print(" NULL", file=fp)
    print("};", file=fp)

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print("/* index tables for the database records */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print("/* decomposition data */", file=fp)
    Array("decomp_data", decomp_data).dump(fp, trace)

    print("/* index tables for the decomposition data */", file=fp)
    print("#define DECOMP_SHIFT", shift, file=fp)
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print("/* NFC pairs */", file=fp)
    print("#define COMP_SHIFT", shift, file=fp)
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print("static const change_record change_records_%s[] = {" % cversion, file=fp)
        for record in records:
            print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
        print("};", file=fp)
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        # C accessor mapping a code point to its change record
        print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tint index;", file=fp)
        print("\tif (n >= 0x110000) index = 0;", file=fp)
        print("\telse {", file=fp)
        print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
        print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1)), file=fp)
        print("\t}", file=fp)
        print("\treturn change_records_%s+index;" % cversion, file=fp)
        print("}\n", file=fp)
        # C switch table for version-specific normalization changes
        print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tswitch(n) {", file=fp)
        for k, v in normalization:
            print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
        print("\tdefault: return 0;", file=fp)
        print("\t}\n}\n", file=fp)

    fp.close()
344# --------------------------------------------------------------------
345# unicode character type tables
346
def makeunicodetype(unicode, trace):
    """Write Objects/unicodetype_db.h.

    Builds the deduplicated character-type record table (case mappings,
    decimal/digit values and flag bits), its split index tables, and the
    generated C bodies of _PyUnicode_ToNumeric(), _PyUnicode_IsWhitespace()
    and _PyUnicode_IsLinebreak().
    """

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}          # record tuple -> index into `table`
    index = [0] * len(unicode.chars)
    numeric = {}                # Numeric_Value string -> [code points]
    spaces = []
    linebreaks = []

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            # NOTE(review): `delta` is assigned but never read below --
            # appears to be a leftover; confirm before removing.
            delta = True
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            # printable: space itself, or anything not Control/Separator
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            # use delta predictor for upper/lower/title if it fits
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                # UCD.html says that a missing title char means that
                # it defaults to the uppercase character, not to the
                # character itself. Apparently, in the current UCD (5.x)
                # this feature is never used
                title = upper
            upper_d = upper - char
            lower_d = lower - char
            title_d = title - char
            if -32768 <= upper_d <= 32767 and \
               -32768 <= lower_d <= 32767 and \
               -32768 <= title_d <= 32767:
                # use deltas (stored as unsigned 16-bit two's complement)
                upper = upper_d & 0xffff
                lower = lower_d & 0xffff
                title = title_d & 0xffff
            else:
                flags |= NODELTA_MASK
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables (deduplicated via `cache`)
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("/* a list of unique character type descriptors */", file=fp)
    print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
    for item in table:
        print(" {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    # split decomposition index table
    index1, index2, shift = splitbins(index, trace)

    print("/* type indexes */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # Generate code for _PyUnicode_ToNumeric()
    numeric_items = sorted(numeric.items())
    print('/* Returns the numeric value as double for Unicode characters', file=fp)
    print(' * having this property, -1.0 otherwise.', file=fp)
    print(' */', file=fp)
    print('double _PyUnicode_ToNumeric(Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print(' switch (ch) {', file=fp)
    for value, codepoints in numeric_items:
        # Turn text into float literals (fractions like "1/4" become
        # "0.25"-style numerator/denominator literals)
        parts = value.split('/')
        parts = [repr(float(part)) for part in parts]
        value = '/'.join(parts)

        codepoints.sort()
        for codepoint in codepoints:
            print(' case 0x%04X:' % (codepoint,), file=fp)
        print(' return (double) %s;' % (value,), file=fp)
    print(' }', file=fp)
    print(' return -1.0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsWhitespace()
    print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)
    print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsWhitespace(register const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print(' switch (ch) {', file=fp)

    for codepoint in sorted(spaces):
        print(' case 0x%04X:' % (codepoint,), file=fp)
    print(' return 1;', file=fp)

    print(' }', file=fp)
    print(' return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsLinebreak()
    print("/* Returns 1 for Unicode characters having the line break", file=fp)
    print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp)
    print(" * type 'B', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsLinebreak(register const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print(' switch (ch) {', file=fp)
    for codepoint in sorted(linebreaks):
        print(' case 0x%04X:' % (codepoint,), file=fp)
    print(' return 1;', file=fp)

    print(' }', file=fp)
    print(' return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    fp.close()
525
526# --------------------------------------------------------------------
527# unicode name database
528
529def makeunicodename(unicode, trace):
530
531 FILE = "Modules/unicodename_db.h"
532
Collin Winter6afaeb72007-08-03 17:06:41 +0000533 print("--- Preparing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000534
535 # collect names
536 names = [None] * len(unicode.chars)
537
538 for char in unicode.chars:
539 record = unicode.table[char]
540 if record:
541 name = record[1].strip()
542 if name and name[0] != "<":
543 names[char] = name + chr(0)
544
Georg Brandl559e5d72008-06-11 18:37:52 +0000545 print(len(list(n for n in names if n is not None)), "distinct names")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000546
547 # collect unique words from names (note that we differ between
548 # words inside a sentence, and words ending a sentence. the
549 # latter includes the trailing null byte.
550
551 words = {}
552 n = b = 0
553 for char in unicode.chars:
554 name = names[char]
555 if name:
556 w = name.split()
557 b = b + len(name)
558 n = n + len(w)
559 for w in w:
560 l = words.get(w)
561 if l:
562 l.append(None)
563 else:
564 words[w] = [len(words)]
565
Collin Winter6afaeb72007-08-03 17:06:41 +0000566 print(n, "words in text;", b, "bytes")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000567
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000568 wordlist = list(words.items())
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000569
Martin v. Löwis97225da2002-11-24 23:05:09 +0000570 # sort on falling frequency, then by name
Mark Dickinsona56c4672009-01-27 18:17:45 +0000571 def word_key(a):
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000572 aword, alist = a
Mark Dickinsona56c4672009-01-27 18:17:45 +0000573 return -len(alist), aword
574 wordlist.sort(key=word_key)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000575
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000576 # figure out how many phrasebook escapes we need
577 escapes = 0
578 while escapes * 256 < len(wordlist):
579 escapes = escapes + 1
Collin Winter6afaeb72007-08-03 17:06:41 +0000580 print(escapes, "escapes")
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000581
582 short = 256 - escapes
583
584 assert short > 0
585
Collin Winter6afaeb72007-08-03 17:06:41 +0000586 print(short, "short indexes in lexicon")
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000587
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000588 # statistics
589 n = 0
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000590 for i in range(short):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000591 n = n + len(wordlist[i][1])
Collin Winter6afaeb72007-08-03 17:06:41 +0000592 print(n, "short indexes in phrasebook")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000593
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000594 # pick the most commonly used words, and sort the rest on falling
595 # length (to maximize overlap)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000596
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000597 wordlist, wordtail = wordlist[:short], wordlist[short:]
Raymond Hettingerd4cb56d2008-01-30 02:55:10 +0000598 wordtail.sort(key=lambda a: a[0], reverse=True)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000599 wordlist.extend(wordtail)
600
601 # generate lexicon from words
602
603 lexicon_offset = [0]
604 lexicon = ""
605 words = {}
606
607 # build a lexicon string
608 offset = 0
609 for w, x in wordlist:
610 # encoding: bit 7 indicates last character in word (chr(128)
611 # indicates the last character in an entire string)
612 ww = w[:-1] + chr(ord(w[-1])+128)
613 # reuse string tails, when possible
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000614 o = lexicon.find(ww)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000615 if o < 0:
616 o = offset
617 lexicon = lexicon + ww
618 offset = offset + len(w)
619 words[w] = len(lexicon_offset)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000620 lexicon_offset.append(o)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000621
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000622 lexicon = list(map(ord, lexicon))
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000623
624 # generate phrasebook from names and lexicon
625 phrasebook = [0]
626 phrasebook_offset = [0] * len(unicode.chars)
627 for char in unicode.chars:
628 name = names[char]
629 if name:
630 w = name.split()
631 phrasebook_offset[char] = len(phrasebook)
632 for w in w:
633 i = words[w]
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000634 if i < short:
635 phrasebook.append(i)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000636 else:
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000637 # store as two bytes
638 phrasebook.append((i>>8) + short)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000639 phrasebook.append(i&255)
640
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000641 assert getsize(phrasebook) == 1
642
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000643 #
644 # unicode name hash table
645
646 # extract names
647 data = []
648 for char in unicode.chars:
649 record = unicode.table[char]
650 if record:
651 name = record[1].strip()
652 if name and name[0] != "<":
653 data.append((name, char))
654
655 # the magic number 47 was chosen to minimize the number of
656 # collisions on the current data set. if you like, change it
657 # and see what happens...
658
659 codehash = Hash("code", data, 47)
660
Collin Winter6afaeb72007-08-03 17:06:41 +0000661 print("--- Writing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000662
663 fp = open(FILE, "w")
Collin Winter6afaeb72007-08-03 17:06:41 +0000664 print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
665 print(file=fp)
666 print("#define NAME_MAXLEN", 256, file=fp)
667 print(file=fp)
668 print("/* lexicon */", file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000669 Array("lexicon", lexicon).dump(fp, trace)
670 Array("lexicon_offset", lexicon_offset).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000671
672 # split decomposition index table
673 offset1, offset2, shift = splitbins(phrasebook_offset, trace)
674
Collin Winter6afaeb72007-08-03 17:06:41 +0000675 print("/* code->name phrasebook */", file=fp)
676 print("#define phrasebook_shift", shift, file=fp)
677 print("#define phrasebook_short", short, file=fp)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000678
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000679 Array("phrasebook", phrasebook).dump(fp, trace)
680 Array("phrasebook_offset1", offset1).dump(fp, trace)
681 Array("phrasebook_offset2", offset2).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000682
Collin Winter6afaeb72007-08-03 17:06:41 +0000683 print("/* name->code dictionary */", file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000684 codehash.dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000685
686 fp.close()
687
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000688
def merge_old_version(version, new, old):
    """Compute per-codepoint deltas between an old UCD release and the new one.

    Appends (version, changes, normalization_changes) to new.changed, where
    changes is a list, indexed by codepoint, of
    (bidir, category, decimal, mirrored, numeric) change records, and
    normalization_changes is a list of (codepoint, old-decomposition) pairs.
    Raises NotImplementedError if the composition exclusion sets differ,
    and a local Difference exception for any field change this function
    does not know how to encode.
    """
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            # k indexes the semicolon-separated UnicodeData record fields
            # (see the UnicodeData class's record-structure comment).
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value is better not 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        # bidi-mirrored flag; stored as the character '0'/'1'
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 16:
                        # derived property changes; not yet
                        pass
                    elif k == 17:
                        # normalization quickchecks are not performed
                        # for older versions
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, list(zip(bidir_changes, category_changes,
                                          decimal_changes, mirrored_changes,
                                          numeric_changes)),
                        normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000775
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000776def open_data(template, version):
777 local = template % ('-'+version,)
778 if not os.path.exists(local):
779 import urllib.request
780 if version == '3.2.0':
781 # irregular url structure
782 url = 'http://www.unicode.org/Public/3.2-Update/' + local
783 else:
784 url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
785 urllib.request.urlretrieve(url, filename=local)
786 if local.endswith('.txt'):
787 return open(local, encoding='utf-8')
788 else:
789 # Unihan.zip
790 return open(local, 'rb')
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000791
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000792# --------------------------------------------------------------------
793# the following support code is taken from the unidb utilities
794# Copyright (c) 1999-2000 by Secret Labs AB
795
796# load a unicode-data file from disk
797
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000798class UnicodeData:
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000799 # Record structure:
800 # [ID, name, category, combining, bidi, decomp, (6)
801 # decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
802 # ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
803 # derived-props] (17)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000804
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000805 def __init__(self, version,
806 linebreakprops=False,
Florent Xicluna806d8cf2010-03-30 19:34:18 +0000807 expand=1):
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000808 self.changed = []
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000809 file = open_data(UNICODE_DATA, version)
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000810 table = [None] * 0x110000
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000811 while 1:
812 s = file.readline()
813 if not s:
814 break
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000815 s = s.strip().split(";")
816 char = int(s[0], 16)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000817 table[char] = s
818
Martin v. Löwis97225da2002-11-24 23:05:09 +0000819 # expand first-last ranges
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000820 if expand:
821 field = None
Martin v. Löwis97225da2002-11-24 23:05:09 +0000822 for i in range(0, 0x110000):
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000823 s = table[i]
824 if s:
825 if s[1][-6:] == "First>":
826 s[1] = ""
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000827 field = s
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000828 elif s[1][-5:] == "Last>":
829 s[1] = ""
830 field = None
831 elif field:
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000832 f2 = field[:]
833 f2[0] = "%X" % i
834 table[i] = f2
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000835
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000836 # public attributes
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000837 self.filename = UNICODE_DATA % ''
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000838 self.table = table
Georg Brandlbf82e372008-05-16 17:02:34 +0000839 self.chars = list(range(0x110000)) # unicode 3.2
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000840
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000841 file = open_data(COMPOSITION_EXCLUSIONS, version)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000842 self.exclusions = {}
843 for s in file:
844 s = s.strip()
845 if not s:
846 continue
847 if s[0] == '#':
848 continue
849 char = int(s.split()[0],16)
850 self.exclusions[char] = 1
851
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +0000852 widths = [None] * 0x110000
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000853 for s in open_data(EASTASIAN_WIDTH, version):
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +0000854 s = s.strip()
855 if not s:
856 continue
857 if s[0] == '#':
858 continue
859 s = s.split()[0].split(';')
860 if '..' in s[0]:
861 first, last = [int(c, 16) for c in s[0].split('..')]
Georg Brandlbf82e372008-05-16 17:02:34 +0000862 chars = list(range(first, last+1))
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +0000863 else:
864 chars = [int(s[0], 16)]
865 for char in chars:
866 widths[char] = s[1]
867 for i in range(0, 0x110000):
868 if table[i] is not None:
869 table[i].append(widths[i])
870
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000871 for i in range(0, 0x110000):
872 if table[i] is not None:
873 table[i].append(set())
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000874 for s in open_data(DERIVED_CORE_PROPERTIES, version):
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000875 s = s.split('#', 1)[0].strip()
876 if not s:
877 continue
878
879 r, p = s.split(";")
880 r = r.strip()
881 p = p.strip()
882 if ".." in r:
883 first, last = [int(c, 16) for c in r.split('..')]
Georg Brandlbf82e372008-05-16 17:02:34 +0000884 chars = list(range(first, last+1))
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000885 else:
886 chars = [int(r, 16)]
887 for char in chars:
888 if table[char]:
889 # Some properties (e.g. Default_Ignorable_Code_Point)
890 # apply to unassigned code points; ignore them
891 table[char][-1].add(p)
892
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000893 for s in open_data(LINE_BREAK, version):
894 s = s.partition('#')[0]
895 s = [i.strip() for i in s.split(';')]
896 if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
897 continue
898 if '..' not in s[0]:
899 first = last = int(s[0], 16)
900 else:
901 first, last = [int(c, 16) for c in s[0].split('..')]
902 for char in range(first, last+1):
903 table[char][-1].add('Line_Break')
Florent Xicluna806d8cf2010-03-30 19:34:18 +0000904
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000905 # We only want the quickcheck properties
906 # Format: NF?_QC; Y(es)/N(o)/M(aybe)
907 # Yes is the default, hence only N and M occur
908 # In 3.2.0, the format was different (NF?_NO)
909 # The parsing will incorrectly determine these as
910 # "yes", however, unicodedata.c will not perform quickchecks
911 # for older versions, and no delta records will be created.
912 quickchecks = [0] * 0x110000
913 qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
914 for s in open_data(DERIVEDNORMALIZATION_PROPS, version):
915 if '#' in s:
916 s = s[:s.index('#')]
917 s = [i.strip() for i in s.split(';')]
918 if len(s) < 2 or s[1] not in qc_order:
919 continue
920 quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
921 quickcheck_shift = qc_order.index(s[1])*2
922 quickcheck <<= quickcheck_shift
923 if '..' not in s[0]:
924 first = last = int(s[0], 16)
925 else:
926 first, last = [int(c, 16) for c in s[0].split('..')]
927 for char in range(first, last+1):
928 assert not (quickchecks[char]>>quickcheck_shift)&3
929 quickchecks[char] |= quickcheck
930 for i in range(0, 0x110000):
931 if table[i] is not None:
932 table[i].append(quickchecks[i])
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000933
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000934 zip = zipfile.ZipFile(open_data(UNIHAN, version))
935 if version == '3.2.0':
936 data = zip.open('Unihan-3.2.0.txt').read()
937 else:
938 data = zip.open('Unihan_NumericValues.txt').read()
939 for line in data.decode("utf-8").splitlines():
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +0000940 if not line.startswith('U+'):
941 continue
942 code, tag, value = line.split(None, 3)[:3]
943 if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
944 'kOtherNumeric'):
945 continue
946 value = value.strip().replace(',', '')
947 i = int(code[2:], 16)
948 # Patch the numeric field
949 if table[i] is not None:
950 table[i][8] = value
951
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000952 def uselatin1(self):
953 # restrict character range to ISO Latin 1
Georg Brandlbf82e372008-05-16 17:02:34 +0000954 self.chars = list(range(256))
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000955
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000956# hash table tools
957
958# this is a straight-forward reimplementation of Python's built-in
959# dictionary type, using a static data structure, and a custom string
960# hash algorithm.
961
def myhash(s, magic):
    """Case-insensitive 24-bit multiplicative string hash.

    Upper-cases *s*, then for each character multiplies the running hash
    by *magic* and adds the character's ordinal, folding any bits that
    overflow into the top byte back into the low 24 bits.
    """
    digest = 0
    for ch in s.upper():
        digest = digest * magic + ord(ch)
        overflow = digest & 0xff000000
        if overflow:
            # XOR the overflowing byte back in and keep only 24 bits
            digest = (digest ^ ((overflow >> 24) & 0xff)) & 0x00ffffff
    return digest
970
# (table size, polynomial) candidates for the static hash table below;
# sizes are powers of two, and Hash.__init__ picks the first size larger
# than the data set, pairing it with its polynomial for collision probing.
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
977
class Hash:
    """A static open-addressing hash table, dumpable as C data.

    Reimplements Python's dictionary probing with a fixed table so the C
    runtime can look names up without building a dict: slot index comes
    from myhash(), collisions are resolved by shifting the increment and
    folding it with the size's polynomial.
    """
    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size: first power of two larger than the data set
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynominals")

        print(size, "slots in hash table")

        table = [None] * size

        mask = size-1

        # n counts probe collisions, for statistics only
        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            # slot taken: derive a probe increment from the hash
            incr = (h ^ (h >> 3)) & mask;
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                # vary the increment, folding overflow with the polynomial
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print(n, "collisions")
        self.collisions = n

        # empty slots are stored as 0 in the C array
        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array plus the probing parameters
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
1041
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001042# stuff to deal with arrays of unsigned integers
1043
class Array:
    """A named array of unsigned integers, dumpable as a C declaration."""

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array; the element type is the
        # smallest unsigned C integer that holds every value
        width = getsize(self.data)
        if trace:
            print(self.name+":", width*len(self.data), "bytes", file=sys.stderr)
        file.write("static ")
        if width == 1:
            ctype = "unsigned char"
        elif width == 2:
            ctype = "unsigned short"
        else:
            ctype = "unsigned int"
        file.write(ctype)
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            # wrap the initializer at 78 columns
            line = "    "
            for value in self.data:
                text = str(value) + ", "
                if len(line) + len(text) > 78:
                    file.write(line + "\n")
                    line = "    " + text
                else:
                    line = line + text
            if line.strip():
                file.write(line + "\n")
        file.write("};\n\n")
1075
def getsize(data):
    """Return the smallest unsigned C integer size, in bytes (1, 2 or 4),
    able to hold every value in the given array of non-negative ints.

    An empty array fits in any width, so 1 is returned (the original
    version raised ValueError from max() on empty input).
    """
    if not data:
        return 1
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4
1085
Tim Peters21013482000-09-25 07:13:41 +00001086def splitbins(t, trace=0):
1087 """t, trace=0 -> (t1, t2, shift). Split a table to save space.
1088
1089 t is a sequence of ints. This function can be useful to save space if
1090 many of the ints are the same. t1 and t2 are lists of ints, and shift
1091 is an int, chosen to minimize the combined size of t1 and t2 (in C
1092 code), and where for each i in range(len(t)),
1093 t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1094 where mask is a bitmask isolating the last "shift" bits.
1095
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001096 If optional arg trace is non-zero (default zero), progress info
1097 is printed to sys.stderr. The higher the value, the more info
1098 you'll get.
Tim Peters21013482000-09-25 07:13:41 +00001099 """
1100
Tim Peters21013482000-09-25 07:13:41 +00001101 if trace:
1102 def dump(t1, t2, shift, bytes):
Collin Winter6afaeb72007-08-03 17:06:41 +00001103 print("%d+%d bins at shift %d; %d bytes" % (
1104 len(t1), len(t2), shift, bytes), file=sys.stderr)
1105 print("Size of original table:", len(t)*getsize(t), \
1106 "bytes", file=sys.stderr)
Tim Peters21013482000-09-25 07:13:41 +00001107 n = len(t)-1 # last valid index
1108 maxshift = 0 # the most we can shift n and still have something left
1109 if n > 0:
1110 while n >> 1:
1111 n >>= 1
1112 maxshift += 1
1113 del n
Christian Heimesa37d4c62007-12-04 23:02:19 +00001114 bytes = sys.maxsize # smallest total size so far
Tim Peters21013482000-09-25 07:13:41 +00001115 t = tuple(t) # so slices can be dict keys
1116 for shift in range(maxshift + 1):
1117 t1 = []
1118 t2 = []
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001119 size = 2**shift
1120 bincache = {}
Tim Peters21013482000-09-25 07:13:41 +00001121 for i in range(0, len(t), size):
1122 bin = t[i:i+size]
1123 index = bincache.get(bin)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001124 if index is None:
Tim Peters21013482000-09-25 07:13:41 +00001125 index = len(t2)
1126 bincache[bin] = index
1127 t2.extend(bin)
1128 t1.append(index >> shift)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001129 # determine memory size
Tim Peters21013482000-09-25 07:13:41 +00001130 b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001131 if trace > 1:
Tim Peters21013482000-09-25 07:13:41 +00001132 dump(t1, t2, shift, b)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001133 if b < bytes:
Tim Peters21013482000-09-25 07:13:41 +00001134 best = t1, t2, shift
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001135 bytes = b
Tim Peters21013482000-09-25 07:13:41 +00001136 t1, t2, shift = best
1137 if trace:
Collin Winter6afaeb72007-08-03 17:06:41 +00001138 print("Best:", end=' ', file=sys.stderr)
Tim Peters21013482000-09-25 07:13:41 +00001139 dump(t1, t2, shift, bytes)
1140 if __debug__:
1141 # exhaustively verify that the decomposition is correct
1142 mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
Guido van Rossum805365e2007-05-07 22:24:25 +00001143 for i in range(len(t)):
Tim Peters21013482000-09-25 07:13:41 +00001144 assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1145 return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001146
if __name__ == "__main__":
    # regenerate all tables, with tracing enabled
    maketables(1)