Blame - Tools/unicode/makeunicodedata.py - platform/external/python/cpython2

2000-09-24 23:18:31 +0000

[diff] [blame]

1

#

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

2

# (re)generate unicode property and type databases

3

#

Martin v. Löwis

b5c980b

2002-11-25 09:13:37 +0000

[diff] [blame]

4

# this script converts a unicode 3.2 database file to

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

5

# Modules/unicodedata_db.h, Modules/unicodename_db.h,

6

# and Objects/unicodetype_db.h

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

7

#

8

# history:

9

# 2000-09-24 fl created (based on bits and pieces from unidb)

10

# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

11

# 2000-09-25 fl added character type table

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

12

# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

13

# 2000-11-03 fl expand first/last ranges

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

14

# 2001-01-19 fl added character name tables (2.1)

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

15

# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold

Martin v. Löwis

2002-11-23 22:08:15 +0000

[diff] [blame]

16

# 2002-09-11 wd use string methods

17

# 2002-10-18 mvl update to Unicode 3.2

18

# 2002-10-22 mvl generate NFC tables

Martin v. Löwis

2002-11-24 23:05:09 +0000

[diff] [blame]

19

# 2002-11-24 mvl expand all ranges, sort names version-independently

Martin v. Löwis

b5c980b

2002-11-25 09:13:37 +0000

[diff] [blame]

20

# 2002-11-25 mvl add UNIDATA_VERSION

Hye-Shik Chang

974ed7c

2004-06-02 16:49:17 +0000

[diff] [blame]

21

# 2004-05-29 perky add east asian width information

Martin v. Löwis

43179c8

2006-03-11 12:43:44 +0000

[diff] [blame]

22

# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta

Georg Brandl

d52429f

2008-07-04 15:55:02 +0000

[diff] [blame]

23

# 2008-06-11 gb add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch

Ezio Melotti

2011-10-21 21:57:36 +0300

[diff] [blame]

24

# 2011-10-21 ezio add support for name aliases and named sequences

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

25

#

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

26

# written by Fredrik Lundh (fredrik@pythonware.com)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

27

#

28

Ezio Melotti

2011-10-21 21:57:36 +0300

[diff] [blame]

import os

import sys

import zipfile

from textwrap import dedent

34

from operator import itemgetter

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

35

36

SCRIPT = sys.argv[0]

Amaury Forgeot d'Arc

2010-08-18 20:44:58 +0000

[diff] [blame]

37

VERSION = "3.2"

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

38

Martin v. Löwis

b5c980b

2002-11-25 09:13:37 +0000

[diff] [blame]

39

# The Unicode Database

Martin v. Löwis

2010-10-11 22:42:28 +0000

[diff] [blame]

40

UNIDATA_VERSION = "6.0.0"

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

41

UNICODE_DATA = "UnicodeData%s.txt"

42

COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"

43

EASTASIAN_WIDTH = "EastAsianWidth%s.txt"

Martin v. Löwis

2010-10-11 22:42:28 +0000

[diff] [blame]

44

UNIHAN = "Unihan%s.zip"

Martin v. Löwis

2007-08-14 22:37:03 +0000

[diff] [blame]

45

DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"

Antoine Pitrou

2009-04-27 22:31:40 +0000

[diff] [blame]

46

DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"

Florent Xicluna

2010-03-30 19:34:18 +0000

[diff] [blame]

47

LINE_BREAK = "LineBreak%s.txt"

Ezio Melotti

2011-10-21 21:57:36 +0300

[diff] [blame]

48

NAME_ALIASES = "NameAliases%s.txt"

49

NAMED_SEQUENCES = "NamedSequences%s.txt"

50

51

# Private Use Areas -- in planes 1, 15, 16

52

PUA_1 = range(0xE000, 0xF900)

53

PUA_15 = range(0xF0000, 0xFFFFE)

54

PUA_16 = range(0x100000, 0x10FFFE)

55

56

# we use this ranges of PUA_15 to store name aliases and named sequences

57

NAME_ALIASES_START = 0xF0000

58

NAMED_SEQUENCES_START = 0xF0100

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

59

60

old_versions = ["3.2.0"]

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

61

62

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",

63

"Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",

64

"Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",

65

"So" ]

66

67

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",

68

"PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",

69

"ON" ]

70

Hye-Shik Chang

2004-08-04 07:38:35 +0000

[diff] [blame]

71

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

72

Florent Xicluna

2010-03-30 19:34:18 +0000

[diff] [blame]

73

MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

74

Fredrik Lundh

2000-09-25 21:01:56 +0000

[diff] [blame]

75

# note: should match definitions in Objects/unicodectype.c

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

ALPHA_MASK = 0x01

DECIMAL_MASK = 0x02

DIGIT_MASK = 0x04

LOWER_MASK = 0x08

Fredrik Lundh

2000-09-25 21:01:56 +0000

[diff] [blame]

80

LINEBREAK_MASK = 0x10

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

81

SPACE_MASK = 0x20

82

TITLE_MASK = 0x40

83

UPPER_MASK = 0x80

Martin v. Löwis

2007-08-14 22:37:03 +0000

[diff] [blame]

84

XID_START_MASK = 0x100

85

XID_CONTINUE_MASK = 0x200

Georg Brandl

d52429f

2008-07-04 15:55:02 +0000

[diff] [blame]

86

PRINTABLE_MASK = 0x400

Martin v. Löwis

2008-09-10 14:08:48 +0000

[diff] [blame]

87

NODELTA_MASK = 0x800

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

88

NUMERIC_MASK = 0x1000

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

89

Martin v. Löwis

2010-11-22 09:00:02 +0000

[diff] [blame]

90

# these ranges need to match unicodedata.c:is_unified_ideograph

cjk_ranges = [

('3400', '4DB5'),

('4E00', '9FCB'),

('20000', '2A6D6'),

('2A700', '2B734'),

('2B740', '2B81D')

]

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

99

def maketables(trace=0):

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

100

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

101

print("--- Reading", UNICODE_DATA % "", "...")

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

102

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

103

version = ""

Martin v. Löwis

2010-10-11 22:42:28 +0000

[diff] [blame]

104

unicode = UnicodeData(UNIDATA_VERSION)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

105

Georg Brandl

559e5d7

2008-06-11 18:37:52 +0000

[diff] [blame]

106

print(len(list(filter(None, unicode.table))), "characters")

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

107

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

108

for version in old_versions:

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

109

print("--- Reading", UNICODE_DATA % ("-"+version), "...")

Martin v. Löwis

2010-11-22 09:00:02 +0000

[diff] [blame]

110

old_unicode = UnicodeData(version, cjk_check=False)

Georg Brandl

559e5d7

2008-06-11 18:37:52 +0000

[diff] [blame]

111

print(len(list(filter(None, old_unicode.table))), "characters")

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

112

merge_old_version(version, unicode, old_unicode)

113

Fredrik Lundh

b2dfd73

2001-01-21 23:31:52 +0000

[diff] [blame]

114

makeunicodename(unicode, trace)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

115

makeunicodedata(unicode, trace)

Fredrik Lundh

b2dfd73

2001-01-21 23:31:52 +0000

[diff] [blame]

116

makeunicodetype(unicode, trace)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

117

118

# --------------------------------------------------------------------

119

# unicode character properties

120

121

def makeunicodedata(unicode, trace):

122

Antoine Pitrou

2009-04-27 22:31:40 +0000

[diff] [blame]

123

dummy = (0, 0, 0, 0, 0, 0)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

124

table = [dummy]

125

cache = {0: dummy}

126

index = [0] * len(unicode.chars)

127

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

128

FILE = "Modules/unicodedata_db.h"

129

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

130

print("--- Preparing", FILE, "...")

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

131

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

132

# 1) database properties

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

133

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

134

for char in unicode.chars:

135

record = unicode.table[char]

136

if record:

137

# extract database properties

138

category = CATEGORY_NAMES.index(record[2])

139

combining = int(record[3])

140

bidirectional = BIDIRECTIONAL_NAMES.index(record[4])

141

mirrored = record[9] == "Y"

Hye-Shik Chang

2004-08-04 07:38:35 +0000

[diff] [blame]

142

eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])

Antoine Pitrou

2009-04-27 22:31:40 +0000

[diff] [blame]

143

normalizationquickcheck = record[17]

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

144

item = (

Antoine Pitrou

2009-04-27 22:31:40 +0000

[diff] [blame]

145

category, combining, bidirectional, mirrored, eastasianwidth,

146

normalizationquickcheck

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

147

)

148

# add entry to index and item tables

149

i = cache.get(item)

150

if i is None:

151

cache[item] = i = len(table)

table.append(item)

index[char] = i

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

155

# 2) decomposition data

156

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

157

decomp_data = [0]

158

decomp_prefix = [""]

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

159

decomp_index = [0] * len(unicode.chars)

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

160

decomp_size = 0

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

161

Martin v. Löwis

2002-11-23 22:08:15 +0000

[diff] [blame]

162

comp_pairs = []

163

comp_first = [None] * len(unicode.chars)

164

comp_last = [None] * len(unicode.chars)

165

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

166

for char in unicode.chars:

167

record = unicode.table[char]

168

if record:

169

if record[5]:

Walter Dörwald

2002-09-11 20:36:02 +0000

[diff] [blame]

170

decomp = record[5].split()

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

171

if len(decomp) > 19:

Collin Winter

a817e58

2007-08-22 23:05:06 +0000

[diff] [blame]

172

raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

173

# prefix

174

if decomp[0][0] == "<":

175

prefix = decomp.pop(0)

176

else:

177

prefix = ""

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

178

try:

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

179

i = decomp_prefix.index(prefix)

180

except ValueError:

181

i = len(decomp_prefix)

182

decomp_prefix.append(prefix)

183

prefix = i

184

assert prefix < 256

185

# content

Georg Brandl

bf82e37

2008-05-16 17:02:34 +0000

[diff] [blame]

186

decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]

Martin v. Löwis

2002-11-23 22:08:15 +0000

[diff] [blame]

187

# Collect NFC pairs

188

if not prefix and len(decomp) == 3 and \

189

char not in unicode.exclusions and \

190

unicode.table[decomp[1]][3] == "0":

p, l, r = decomp

comp_first[l] = 1

comp_last[r] = 1

comp_pairs.append((l,r,char))

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

195

try:

196

i = decomp_data.index(decomp)

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

197

except ValueError:

198

i = len(decomp_data)

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

199

decomp_data.extend(decomp)

200

decomp_size = decomp_size + len(decomp) * 2

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

201

else:

202

i = 0

203

decomp_index[char] = i

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

204

Martin v. Löwis

2002-11-23 22:08:15 +0000

[diff] [blame]

205

f = l = 0

206

comp_first_ranges = []

207

comp_last_ranges = []

208

prev_f = prev_l = None

209

for i in unicode.chars:

210

if comp_first[i] is not None:

comp_first[i] = f

f += 1

if prev_f is None:

prev_f = (i,i)

elif prev_f[1]+1 == i:

216

prev_f = prev_f[0],i

217

else:

218

comp_first_ranges.append(prev_f)

219

prev_f = (i,i)

220

if comp_last[i] is not None:

comp_last[i] = l

l += 1

if prev_l is None:

prev_l = (i,i)

elif prev_l[1]+1 == i:

226

prev_l = prev_l[0],i

227

else:

228

comp_last_ranges.append(prev_l)

229

prev_l = (i,i)

230

comp_first_ranges.append(prev_f)

231

comp_last_ranges.append(prev_l)

total_first = f

total_last = l

comp_data = [0]*(total_first*total_last)

236

for f,l,char in comp_pairs:

237

f = comp_first[f]

238

l = comp_last[l]

239

comp_data[f*total_last+l] = char

240

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

241

print(len(table), "unique properties")

242

print(len(decomp_prefix), "unique decomposition prefixes")

243

print(len(decomp_data), "unique decomposition entries:", end=' ')

244

print(decomp_size, "bytes")

245

print(total_first, "first characters in NFC")

246

print(total_last, "last characters in NFC")

247

print(len(comp_pairs), "NFC pairs")

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

248

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

249

print("--- Writing", FILE, "...")

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

250

Fred Drake

9c68505

2000-10-26 03:56:46 +0000

[diff] [blame]

251

fp = open(FILE, "w")

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

252

print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)

253

print(file=fp)

254

print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)

255

print("/* a list of unique database records */", file=fp)

256

print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

257

for item in table:

Antoine Pitrou

2009-04-27 22:31:40 +0000

[diff] [blame]

258

print(" {%d, %d, %d, %d, %d, %d}," % item, file=fp)

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

259

print("};", file=fp)

260

print(file=fp)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

261

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

262

print("/* Reindexing of NFC first characters. */", file=fp)

263

print("#define TOTAL_FIRST",total_first, file=fp)

264

print("#define TOTAL_LAST",total_last, file=fp)

265

print("struct reindex{int start;short count,index;};", file=fp)

Martin v. Löwis

59683e8

2008-06-13 07:50:45 +0000

[diff] [blame]

266

print("static struct reindex nfc_first[] = {", file=fp)

Martin v. Löwis

2002-11-23 22:08:15 +0000

[diff] [blame]

267

for start,end in comp_first_ranges:

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

268

print(" { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)

269

print(" {0,0,0}", file=fp)

270

print("};\n", file=fp)

Martin v. Löwis

59683e8

2008-06-13 07:50:45 +0000

[diff] [blame]

271

print("static struct reindex nfc_last[] = {", file=fp)

Martin v. Löwis

2002-11-23 22:08:15 +0000

[diff] [blame]

272

for start,end in comp_last_ranges:

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

273

print(" { %d, %d, %d}," % (start,end-start,comp_last[start]), file=fp)

274

print(" {0,0,0}", file=fp)

275

print("};\n", file=fp)

Martin v. Löwis

2002-11-23 22:08:15 +0000

[diff] [blame]

276

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

277

# FIXME: <fl> the following tables could be made static, and

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

278

# the support code moved into unicodedatabase.c

279

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

280

print("/* string literals */", file=fp)

281

print("const char *_PyUnicode_CategoryNames[] = {", file=fp)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

282

for name in CATEGORY_NAMES:

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

283

print(" \"%s\"," % name, file=fp)

284

print(" NULL", file=fp)

285

print("};", file=fp)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

286

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

287

print("const char *_PyUnicode_BidirectionalNames[] = {", file=fp)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

288

for name in BIDIRECTIONAL_NAMES:

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

289

print(" \"%s\"," % name, file=fp)

290

print(" NULL", file=fp)

291

print("};", file=fp)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

292

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

293

print("const char *_PyUnicode_EastAsianWidthNames[] = {", file=fp)

Hye-Shik Chang

2004-08-04 07:38:35 +0000

[diff] [blame]

294

for name in EASTASIANWIDTH_NAMES:

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

295

print(" \"%s\"," % name, file=fp)

296

print(" NULL", file=fp)

297

print("};", file=fp)

Hye-Shik Chang

2004-08-04 07:38:35 +0000

[diff] [blame]

298

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

299

print("static const char *decomp_prefix[] = {", file=fp)

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

300

for name in decomp_prefix:

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

301

print(" \"%s\"," % name, file=fp)

302

print(" NULL", file=fp)

303

print("};", file=fp)

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

304

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

305

# split record index table

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

306

index1, index2, shift = splitbins(index, trace)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

307

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

308

print("/* index tables for the database records */", file=fp)

309

print("#define SHIFT", shift, file=fp)

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

310

Array("index1", index1).dump(fp, trace)

311

Array("index2", index2).dump(fp, trace)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

312

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

313

# split decomposition index table

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

314

index1, index2, shift = splitbins(decomp_index, trace)

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

315

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

316

print("/* decomposition data */", file=fp)

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

317

Array("decomp_data", decomp_data).dump(fp, trace)

318

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

319

print("/* index tables for the decomposition data */", file=fp)

320

print("#define DECOMP_SHIFT", shift, file=fp)

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

321

Array("decomp_index1", index1).dump(fp, trace)

322

Array("decomp_index2", index2).dump(fp, trace)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

323

Martin v. Löwis

2002-11-23 22:08:15 +0000

[diff] [blame]

324

index, index2, shift = splitbins(comp_data, trace)

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

325

print("/* NFC pairs */", file=fp)

326

print("#define COMP_SHIFT", shift, file=fp)

Martin v. Löwis

2002-11-23 22:08:15 +0000

[diff] [blame]

327

Array("comp_index", index).dump(fp, trace)

328

Array("comp_data", index2).dump(fp, trace)

329

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

330

# Generate delta tables for old versions

331

for version, table, normalization in unicode.changed:

332

cversion = version.replace(".","_")

333

records = [table[0]]

334

cache = {table[0]:0}

335

index = [0] * len(table)

336

for i, record in enumerate(table):

337

try:

338

index[i] = cache[record]

339

except KeyError:

340

index[i] = cache[record] = len(records)

341

records.append(record)

342

index1, index2, shift = splitbins(index, trace)

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

343

print("static const change_record change_records_%s[] = {" % cversion, file=fp)

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

344

for record in records:

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

345

print("\t{ %s }," % ", ".join(map(str,record)), file=fp)

346

print("};", file=fp)

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

347

Array("changes_%s_index" % cversion, index1).dump(fp, trace)

348

Array("changes_%s_data" % cversion, index2).dump(fp, trace)

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

349

print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)

350

print("{", file=fp)

351

print("\tint index;", file=fp)

352

print("\tif (n >= 0x110000) index = 0;", file=fp)

353

print("\telse {", file=fp)

354

print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)

355

print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \

356

(cversion, shift, ((1<<shift)-1)), file=fp)

357

print("\t}", file=fp)

358

print("\treturn change_records_%s+index;" % cversion, file=fp)

359

print("}\n", file=fp)

360

print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)

361

print("{", file=fp)

362

print("\tswitch(n) {", file=fp)

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

363

for k, v in normalization:

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

364

print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)

365

print("\tdefault: return 0;", file=fp)

366

print("\t}\n}\n", file=fp)

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

367

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

368

fp.close()

369

370

# --------------------------------------------------------------------

371

# unicode character type tables

372

373

def makeunicodetype(unicode, trace):

374

375

FILE = "Objects/unicodetype_db.h"

376

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

377

print("--- Preparing", FILE, "...")

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

378

379

# extract unicode types

Fredrik Lundh

2000-09-25 21:01:56 +0000

[diff] [blame]

380

dummy = (0, 0, 0, 0, 0, 0)

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

381

table = [dummy]

382

cache = {0: dummy}

383

index = [0] * len(unicode.chars)

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

384

numeric = {}

385

spaces = []

386

linebreaks = []

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

387

388

for char in unicode.chars:

389

record = unicode.table[char]

390

if record:

391

# extract database properties

392

category = record[2]

393

bidirectional = record[4]

Martin v. Löwis

2007-08-14 22:37:03 +0000

[diff] [blame]

394

properties = record[16]

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

395

flags = 0

Martin v. Löwis

2008-09-10 14:08:48 +0000

[diff] [blame]

396

delta = True

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

397

if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:

398

flags |= ALPHA_MASK

399

if category == "Ll":

400

flags |= LOWER_MASK

Florent Xicluna

2010-03-30 19:34:18 +0000

[diff] [blame]

401

if 'Line_Break' in properties or bidirectional == "B":

Fredrik Lundh

2000-09-25 21:01:56 +0000

[diff] [blame]

402

flags |= LINEBREAK_MASK

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

403

linebreaks.append(char)

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

404

if category == "Zs" or bidirectional in ("WS", "B", "S"):

405

flags |= SPACE_MASK

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

406

spaces.append(char)

Fredrik Lundh

375732c

2000-09-25 23:03:34 +0000

[diff] [blame]

407

if category == "Lt":

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

408

flags |= TITLE_MASK

409

if category == "Lu":

410

flags |= UPPER_MASK

Benjamin Peterson

0983274

2009-03-26 17:15:46 +0000

[diff] [blame]

411

if char == ord(" ") or category[0] not in ("C", "Z"):

Georg Brandl

d52429f

2008-07-04 15:55:02 +0000

[diff] [blame]

412

flags |= PRINTABLE_MASK

Martin v. Löwis

2007-08-14 22:37:03 +0000

[diff] [blame]

413

if "XID_Start" in properties:

414

flags |= XID_START_MASK

415

if "XID_Continue" in properties:

416

flags |= XID_CONTINUE_MASK

Martin v. Löwis

2008-09-10 14:08:48 +0000

[diff] [blame]

417

# use delta predictor for upper/lower/title if it fits

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

418

if record[12]:

Walter Dörwald

2009-04-25 14:13:56 +0000

[diff] [blame]

419

upper = int(record[12], 16)

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

420

else:

Walter Dörwald

2009-04-25 14:13:56 +0000

[diff] [blame]

421

upper = char

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

422

if record[13]:

Walter Dörwald

2009-04-25 14:13:56 +0000

[diff] [blame]

423

lower = int(record[13], 16)

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

424

else:

Walter Dörwald

2009-04-25 14:13:56 +0000

[diff] [blame]

425

lower = char

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

426

if record[14]:

Walter Dörwald

2009-04-25 14:13:56 +0000

[diff] [blame]

427

title = int(record[14], 16)

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

428

else:

Walter Dörwald

2009-04-25 14:13:56 +0000

[diff] [blame]

429

# UCD.html says that a missing title char means that

430

# it defaults to the uppercase character, not to the

431

# character itself. Apparently, in the current UCD (5.x)

432

# this feature is never used

433

title = upper

434

upper_d = upper - char

435

lower_d = lower - char

436

title_d = title - char

437

if -32768 <= upper_d <= 32767 and \

438

-32768 <= lower_d <= 32767 and \

439

-32768 <= title_d <= 32767:

440

# use deltas

441

upper = upper_d & 0xffff

442

lower = lower_d & 0xffff

443

title = title_d & 0xffff

444

else:

Martin v. Löwis

2008-09-10 14:08:48 +0000

[diff] [blame]

445

flags |= NODELTA_MASK

Fredrik Lundh

2000-09-25 21:01:56 +0000

[diff] [blame]

446

# decimal digit, integer digit

447

decimal = 0

448

if record[6]:

449

flags |= DECIMAL_MASK

450

decimal = int(record[6])

digit = 0

if record[7]:

flags |= DIGIT_MASK

digit = int(record[7])

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

455

if record[8]:

456

flags |= NUMERIC_MASK

457

numeric.setdefault(record[8], []).append(char)

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

458

item = (

Hye-Shik Chang

974ed7c

2004-06-02 16:49:17 +0000

[diff] [blame]

459

upper, lower, title, decimal, digit, flags

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

460

)

461

# add entry to index and item tables

462

i = cache.get(item)

463

if i is None:

464

cache[item] = i = len(table)

table.append(item)

index[char] = i

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

468

print(len(table), "unique character type entries")

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

469

print(sum(map(len, numeric.values())), "numeric code points")

470

print(len(spaces), "whitespace code points")

471

print(len(linebreaks), "linebreak code points")

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

472

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

473

print("--- Writing", FILE, "...")

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

474

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

475

fp = open(FILE, "w")

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

476

print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)

477

print(file=fp)

478

print("/* a list of unique character type descriptors */", file=fp)

479

print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

480

for item in table:

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

481

print(" {%d, %d, %d, %d, %d, %d}," % item, file=fp)

482

print("};", file=fp)

483

print(file=fp)

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

484

485

# split decomposition index table

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

486

index1, index2, shift = splitbins(index, trace)

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

487

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

488

print("/* type indexes */", file=fp)

489

print("#define SHIFT", shift, file=fp)

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

490

Array("index1", index1).dump(fp, trace)

491

Array("index2", index2).dump(fp, trace)

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

492

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

493

# Generate code for _PyUnicode_ToNumeric()

494

numeric_items = sorted(numeric.items())

495

print('/* Returns the numeric value as double for Unicode characters', file=fp)

496

print(' * having this property, -1.0 otherwise.', file=fp)

497

print(' */', file=fp)

Amaury Forgeot d'Arc

2010-08-18 20:44:58 +0000

[diff] [blame]

498

print('double _PyUnicode_ToNumeric(Py_UCS4 ch)', file=fp)

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

499

print('{', file=fp)

500

print(' switch (ch) {', file=fp)

501

for value, codepoints in numeric_items:

Amaury Forgeot d'Arc

919765a

2009-10-13 23:18:53 +0000

[diff] [blame]

502

# Turn text into float literals

503

parts = value.split('/')

504

parts = [repr(float(part)) for part in parts]

505

value = '/'.join(parts)

506

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

507

codepoints.sort()

508

for codepoint in codepoints:

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

509

print(' case 0x%04X:' % (codepoint,), file=fp)

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

510

print(' return (double) %s;' % (value,), file=fp)

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

511

print(' }', file=fp)

512

print(' return -1.0;', file=fp)

print('}', file=fp)

print(file=fp)

# Generate code for _PyUnicode_IsWhitespace()

517

print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)

518

print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)

519

print(" */", file=fp)

Amaury Forgeot d'Arc

2010-08-18 20:44:58 +0000

[diff] [blame]

520

print('int _PyUnicode_IsWhitespace(register const Py_UCS4 ch)', file=fp)

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

521

print('{', file=fp)

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

522

print(' switch (ch) {', file=fp)

523

Florent Xicluna

f089fd6

2010-03-19 14:25:03 +0000

[diff] [blame]

524

for codepoint in sorted(spaces):

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

525

print(' case 0x%04X:' % (codepoint,), file=fp)

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

526

print(' return 1;', file=fp)

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

527

528

print(' }', file=fp)

529

print(' return 0;', file=fp)

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

print('}', file=fp)

print(file=fp)

# Generate code for _PyUnicode_IsLinebreak()

Florent Xicluna

2010-03-30 19:34:18 +0000

[diff] [blame]

534

print("/* Returns 1 for Unicode characters having the line break", file=fp)

535

print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp)

536

print(" * type 'B', 0 otherwise.", file=fp)

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

537

print(" */", file=fp)

Amaury Forgeot d'Arc

2010-08-18 20:44:58 +0000

[diff] [blame]

538

print('int _PyUnicode_IsLinebreak(register const Py_UCS4 ch)', file=fp)

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

539

print('{', file=fp)

540

print(' switch (ch) {', file=fp)

Florent Xicluna

f089fd6

2010-03-19 14:25:03 +0000

[diff] [blame]

541

for codepoint in sorted(linebreaks):

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

542

print(' case 0x%04X:' % (codepoint,), file=fp)

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

543

print(' return 1;', file=fp)

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

544

545

print(' }', file=fp)

546

print(' return 0;', file=fp)

print('}', file=fp)

print(file=fp)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

550

fp.close()

551

552

# --------------------------------------------------------------------

553

# unicode name database

554

555

def makeunicodename(unicode, trace):

556

557

FILE = "Modules/unicodename_db.h"

558

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

559

print("--- Preparing", FILE, "...")

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

560

561

# collect names

562

names = [None] * len(unicode.chars)

563

564

for char in unicode.chars:

565

record = unicode.table[char]

566

if record:

567

name = record[1].strip()

568

if name and name[0] != "<":

569

names[char] = name + chr(0)

570

Georg Brandl

559e5d7

2008-06-11 18:37:52 +0000

[diff] [blame]

571

print(len(list(n for n in names if n is not None)), "distinct names")

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

572

573

# collect unique words from names (note that we differ between

574

# words inside a sentence, and words ending a sentence. the

575

# latter includes the trailing null byte.

words = {}

n = b = 0

for char in unicode.chars:

name = names[char]

if name:

w = name.split()

b = b + len(name)

n = n + len(w)

for w in w:

l = words.get(w)

if l:

l.append(None)

else:

words[w] = [len(words)]

591

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

592

print(n, "words in text;", b, "bytes")

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

593

Martin v. Löwis

2007-08-14 22:37:03 +0000

[diff] [blame]

594

wordlist = list(words.items())

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

595

Martin v. Löwis

2002-11-24 23:05:09 +0000

[diff] [blame]

596

# sort on falling frequency, then by name

Mark Dickinson

a56c467

2009-01-27 18:17:45 +0000

[diff] [blame]

597

def word_key(a):

Martin v. Löwis

2007-08-14 22:37:03 +0000

[diff] [blame]

598

aword, alist = a

Mark Dickinson

a56c467

2009-01-27 18:17:45 +0000

[diff] [blame]

599

return -len(alist), aword

600

wordlist.sort(key=word_key)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

601

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

602

# figure out how many phrasebook escapes we need

603

escapes = 0

604

while escapes * 256 < len(wordlist):

605

escapes = escapes + 1

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

606

print(escapes, "escapes")

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

607

608

short = 256 - escapes

assert short > 0

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

612

print(short, "short indexes in lexicon")

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

613

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

614

# statistics

615

n = 0

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

616

for i in range(short):

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

617

n = n + len(wordlist[i][1])

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

618

print(n, "short indexes in phrasebook")

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

619

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

620

# pick the most commonly used words, and sort the rest on falling

621

# length (to maximize overlap)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

622

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

623

wordlist, wordtail = wordlist[:short], wordlist[short:]

Raymond Hettinger

d4cb56d

2008-01-30 02:55:10 +0000

[diff] [blame]

624

wordtail.sort(key=lambda a: a[0], reverse=True)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

625

wordlist.extend(wordtail)

626

627

# generate lexicon from words

lexicon_offset = [0]

lexicon = ""

words = {}

# build a lexicon string

634

offset = 0

635

for w, x in wordlist:

636

# encoding: bit 7 indicates last character in word (chr(128)

637

# indicates the last character in an entire string)

638

ww = w[:-1] + chr(ord(w[-1])+128)

639

# reuse string tails, when possible

Walter Dörwald

2002-09-11 20:36:02 +0000

[diff] [blame]

640

o = lexicon.find(ww)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

641

if o < 0:

642

o = offset

643

lexicon = lexicon + ww

644

offset = offset + len(w)

645

words[w] = len(lexicon_offset)

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

646

lexicon_offset.append(o)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

647

Martin v. Löwis

2007-08-14 22:37:03 +0000

[diff] [blame]

648

lexicon = list(map(ord, lexicon))

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

649

650

# generate phrasebook from names and lexicon

651

phrasebook = [0]

652

phrasebook_offset = [0] * len(unicode.chars)

653

for char in unicode.chars:

name = names[char]

if name:

w = name.split()

phrasebook_offset[char] = len(phrasebook)

658

for w in w:

659

i = words[w]

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

660

if i < short:

661

phrasebook.append(i)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

662

else:

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

663

# store as two bytes

664

phrasebook.append((i>>8) + short)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

665

phrasebook.append(i&255)

666

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

667

assert getsize(phrasebook) == 1

668

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

669

#

670

# unicode name hash table

# extract names

data = []

for char in unicode.chars:

675

record = unicode.table[char]

676

if record:

677

name = record[1].strip()

678

if name and name[0] != "<":

679

data.append((name, char))

680

681

# the magic number 47 was chosen to minimize the number of

682

# collisions on the current data set. if you like, change it

683

# and see what happens...

684

685

codehash = Hash("code", data, 47)

686

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

687

print("--- Writing", FILE, "...")

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

688

689

fp = open(FILE, "w")

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

690

print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)

691

print(file=fp)

692

print("#define NAME_MAXLEN", 256, file=fp)

693

print(file=fp)

694

print("/* lexicon */", file=fp)

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

695

Array("lexicon", lexicon).dump(fp, trace)

696

Array("lexicon_offset", lexicon_offset).dump(fp, trace)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

697

698

# split decomposition index table

699

offset1, offset2, shift = splitbins(phrasebook_offset, trace)

700

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

701

print("/* code->name phrasebook */", file=fp)

702

print("#define phrasebook_shift", shift, file=fp)

703

print("#define phrasebook_short", short, file=fp)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

704

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

705

Array("phrasebook", phrasebook).dump(fp, trace)

706

Array("phrasebook_offset1", offset1).dump(fp, trace)

707

Array("phrasebook_offset2", offset2).dump(fp, trace)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

708

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

709

print("/* name->code dictionary */", file=fp)

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

710

codehash.dump(fp, trace)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

711

Ezio Melotti

2011-10-21 21:57:36 +0300

[diff] [blame]

712

print(file=fp)

713

print('static const unsigned int aliases_start = %#x;' %

714

NAME_ALIASES_START, file=fp)

715

print('static const unsigned int aliases_end = %#x;' %

716

(NAME_ALIASES_START + len(unicode.aliases)), file=fp)

717

718

print('static const unsigned int name_aliases[] = {', file=fp)

719

for name, codepoint in unicode.aliases:

720

print(' 0x%04X,' % codepoint, file=fp)

721

print('};', file=fp)

722

723

# In Unicode 6.0.0, the sequences contain at most 4 BMP chars,

724

# so we are using Py_UCS2 seq[4]. This needs to be updated if longer

725

# sequences or sequences with non-BMP chars are added.

726

# unicodedata_lookup should be adapted too.

727

print(dedent("""

728

typedef struct NamedSequence {

int seqlen;

Py_UCS2 seq[4];

} named_sequence;

"""), file=fp)

print('static const unsigned int named_sequences_start = %#x;' %

735

NAMED_SEQUENCES_START, file=fp)

736

print('static const unsigned int named_sequences_end = %#x;' %

737

(NAMED_SEQUENCES_START + len(unicode.named_sequences)), file=fp)

738

739

print('static const named_sequence named_sequences[] = {', file=fp)

740

for name, sequence in unicode.named_sequences:

741

seq_str = ', '.join('0x%04X' % cp for cp in sequence)

742

print(' {%d, {%s}},' % (len(sequence), seq_str), file=fp)

743

print('};', file=fp)

744

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

745

fp.close()

746

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

747

748

def merge_old_version(version, new, old):

749

# Changes to exclusion file not implemented yet

750

if old.exclusions != new.exclusions:

Collin Winter

a817e58

2007-08-22 23:05:06 +0000

[diff] [blame]

751

raise NotImplementedError("exclusions differ")

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

752

753

# In these change records, 0xFF means "no change"

754

bidir_changes = [0xFF]*0x110000

755

category_changes = [0xFF]*0x110000

756

decimal_changes = [0xFF]*0x110000

Martin v. Löwis

2008-09-10 14:08:48 +0000

[diff] [blame]

757

mirrored_changes = [0xFF]*0x110000

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

758

# In numeric data, 0 means "no change",

759

# -1 means "did not have a numeric value

760

numeric_changes = [0] * 0x110000

761

# normalization_changes is a list of key-value pairs

762

normalization_changes = []

763

for i in range(0x110000):

764

if new.table[i] is None:

765

# Characters unassigned in the new version ought to

766

# be unassigned in the old one

767

assert old.table[i] is None

768

continue

769

# check characters unassigned in the old version

770

if old.table[i] is None:

771

# category 0 is "unassigned"

772

category_changes[i] = 0

773

continue

774

# check characters that differ

775

if old.table[i] != new.table[i]:

776

for k in range(len(old.table[i])):

777

if old.table[i][k] != new.table[i][k]:

778

value = old.table[i][k]

Ezio Melotti

2011-10-21 21:57:36 +0300

[diff] [blame]

779

if k == 1 and i in PUA_15:

780

# the name is not set in the old.table, but in the

781

# new.table we are using it for aliases and named seq

782

assert value == ''

783

elif k == 2:

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

784

#print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]

785

category_changes[i] = CATEGORY_NAMES.index(value)

786

elif k == 4:

787

#print "BIDIR",hex(i), old.table[i][k], new.table[i][k]

788

bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)

789

elif k == 5:

790

#print "DECOMP",hex(i), old.table[i][k], new.table[i][k]

791

# We assume that all normalization changes are in 1:1 mappings

792

assert " " not in value

793

normalization_changes.append((i, value))

794

elif k == 6:

795

#print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]

796

# we only support changes where the old value is a single digit

797

assert value in "0123456789"

798

decimal_changes[i] = int(value)

799

elif k == 8:

800

# print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]

801

# Since 0 encodes "no change", the old value is better not 0

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

802

if not value:

803

numeric_changes[i] = -1

804

else:

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

805

numeric_changes[i] = float(value)

806

assert numeric_changes[i] not in (0, -1)

Martin v. Löwis

2008-09-10 14:08:48 +0000

[diff] [blame]

807

elif k == 9:

808

if value == 'Y':

809

mirrored_changes[i] = '1'

810

else:

811

mirrored_changes[i] = '0'

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

812

elif k == 11:

813

# change to ISO comment, ignore

814

pass

815

elif k == 12:

816

# change to simple uppercase mapping; ignore

817

pass

818

elif k == 13:

819

# change to simple lowercase mapping; ignore

820

pass

821

elif k == 14:

822

# change to simple titlecase mapping; ignore

823

pass

Martin v. Löwis

2007-08-14 22:37:03 +0000

[diff] [blame]

824

elif k == 16:

825

# derived property changes; not yet

826

pass

Martin v. Löwis

2010-10-11 22:42:28 +0000

[diff] [blame]

827

elif k == 17:

828

# normalization quickchecks are not performed

829

# for older versions

830

pass

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

831

else:

832

class Difference(Exception):pass

Collin Winter

a817e58

2007-08-22 23:05:06 +0000

[diff] [blame]

833

raise Difference(hex(i), k, old.table[i], new.table[i])

Martin v. Löwis

2007-08-14 22:37:03 +0000

[diff] [blame]

834

new.changed.append((version, list(zip(bidir_changes, category_changes,

Martin v. Löwis

2008-09-10 14:08:48 +0000

[diff] [blame]

835

decimal_changes, mirrored_changes,

836

numeric_changes)),

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

837

normalization_changes))

Tim Peters

88ca467

2006-03-10 23:39:56 +0000

[diff] [blame]

838

Martin v. Löwis

2010-10-11 22:42:28 +0000

[diff] [blame]

839

def open_data(template, version):

840

local = template % ('-'+version,)

841

if not os.path.exists(local):

842

import urllib.request

843

if version == '3.2.0':

844

# irregular url structure

845

url = 'http://www.unicode.org/Public/3.2-Update/' + local

846

else:

847

url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')

848

urllib.request.urlretrieve(url, filename=local)

849

if local.endswith('.txt'):

850

return open(local, encoding='utf-8')

851

else:

852

# Unihan.zip

853

return open(local, 'rb')

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

854

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

855

# --------------------------------------------------------------------

856

# the following support code is taken from the unidb utilities

857

858

859

# load a unicode-data file from disk

860

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

861

class UnicodeData:

Martin v. Löwis

2007-08-14 22:37:03 +0000

[diff] [blame]

862

# Record structure:

863

# [ID, name, category, combining, bidi, decomp, (6)

864

# decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)

865

# ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)

866

# derived-props] (17)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

867

Martin v. Löwis

2010-10-11 22:42:28 +0000

[diff] [blame]

868

def __init__(self, version,

869

linebreakprops=False,

Martin v. Löwis

2010-11-22 09:00:02 +0000

[diff] [blame]

870

expand=1,

871

cjk_check=True):

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

872

self.changed = []

Martin v. Löwis

9def6a3

2002-10-18 16:11:54 +0000

[diff] [blame]

873

table = [None] * 0x110000

Ezio Melotti

2011-09-30 08:46:25 +0300

[diff] [blame]

874

with open_data(UNICODE_DATA, version) as file:

while 1:

s = file.readline()

if not s:

break

s = s.strip().split(";")

880

char = int(s[0], 16)

881

table[char] = s

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

882

Martin v. Löwis

2010-11-22 09:00:02 +0000

[diff] [blame]

883

cjk_ranges_found = []

884

Martin v. Löwis

2002-11-24 23:05:09 +0000

[diff] [blame]

885

# expand first-last ranges

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

886

if expand:

887

field = None

Martin v. Löwis

2002-11-24 23:05:09 +0000

[diff] [blame]

888

for i in range(0, 0x110000):

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

889

s = table[i]

890

if s:

891

if s[1][-6:] == "First>":

892

s[1] = ""

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

893

field = s

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

894

elif s[1][-5:] == "Last>":

Martin v. Löwis

2010-11-22 09:00:02 +0000

[diff] [blame]

895

if s[1].startswith("<CJK Ideograph"):

896

cjk_ranges_found.append((field[0],

897

s[0]))

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

898

s[1] = ""

899

field = None

900

elif field:

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

901

f2 = field[:]

902

f2[0] = "%X" % i

903

table[i] = f2

Martin v. Löwis

2010-11-22 09:00:02 +0000

[diff] [blame]

904

if cjk_check and cjk_ranges != cjk_ranges_found:

905

raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

906

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

907

# public attributes

Martin v. Löwis

2010-10-11 22:42:28 +0000

[diff] [blame]

908

self.filename = UNICODE_DATA % ''

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

909

self.table = table

Georg Brandl

bf82e37

2008-05-16 17:02:34 +0000

[diff] [blame]

910

self.chars = list(range(0x110000)) # unicode 3.2

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

911

Ezio Melotti

2011-10-21 21:57:36 +0300

[diff] [blame]

912

# check for name aliases and named sequences, see #12753

913

# aliases and named sequences are not in 3.2.0

914

if version != '3.2.0':

915

self.aliases = []

916

# store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,

917

# in order to take advantage of the compression and lookup

918

# algorithms used for the other characters

919

pua_index = NAME_ALIASES_START

920

with open_data(NAME_ALIASES, version) as file:

921

for s in file:

922

s = s.strip()

923

if not s or s.startswith('#'):

924

continue

925

char, name = s.split(';')

926

char = int(char, 16)

927

self.aliases.append((name, char))

928

# also store the name in the PUA 1

929

self.table[pua_index][1] = name

930

pua_index += 1

931

assert pua_index - NAME_ALIASES_START == len(self.aliases)

932

933

self.named_sequences = []

934

# store named seqences in the PUA 1, in range U+F0100..,

935

# in order to take advantage of the compression and lookup

936

# algorithms used for the other characters.

937

938

pua_index = NAMED_SEQUENCES_START

939

with open_data(NAMED_SEQUENCES, version) as file:

940

for s in file:

941

s = s.strip()

942

if not s or s.startswith('#'):

943

continue

944

name, chars = s.split(';')

945

chars = tuple(int(char, 16) for char in chars.split())

946

# check that the structure defined in makeunicodename is OK

947

assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"

948

assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "

949

"the NamedSequence struct and in unicodedata_lookup")

950

self.named_sequences.append((name, chars))

951

# also store these in the PUA 1

952

self.table[pua_index][1] = name

953

pua_index += 1

954

assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)

955

Martin v. Löwis

2002-11-23 22:08:15 +0000

[diff] [blame]

956

self.exclusions = {}

Ezio Melotti

2011-09-30 08:46:25 +0300

[diff] [blame]

957

with open_data(COMPOSITION_EXCLUSIONS, version) as file:

for s in file:

s = s.strip()

if not s:

continue

if s[0] == '#':

continue

char = int(s.split()[0],16)

965

self.exclusions[char] = 1

Martin v. Löwis

2002-11-23 22:08:15 +0000

[diff] [blame]

966

Hye-Shik Chang

974ed7c

2004-06-02 16:49:17 +0000

[diff] [blame]

967

widths = [None] * 0x110000

Ezio Melotti

2011-09-30 08:46:25 +0300

[diff] [blame]

968

with open_data(EASTASIAN_WIDTH, version) as file:

for s in file:

s = s.strip()

if not s:

continue

if s[0] == '#':

continue

s = s.split()[0].split(';')

976

if '..' in s[0]:

977

first, last = [int(c, 16) for c in s[0].split('..')]

978

chars = list(range(first, last+1))

979

else:

980

chars = [int(s[0], 16)]

for char in chars:

widths[char] = s[1]

Hye-Shik Chang

2004-06-02 16:49:17 +0000

[diff] [blame]

984

for i in range(0, 0x110000):

985

if table[i] is not None:

986

table[i].append(widths[i])

987

Martin v. Löwis

2007-08-14 22:37:03 +0000

[diff] [blame]

988

for i in range(0, 0x110000):

989

if table[i] is not None:

990

table[i].append(set())

Martin v. Löwis

2007-08-14 22:37:03 +0000

[diff] [blame]

991

Ezio Melotti

2011-09-30 08:46:25 +0300

[diff] [blame]

992

with open_data(DERIVED_CORE_PROPERTIES, version) as file:

993

for s in file:

994

s = s.split('#', 1)[0].strip()

995

if not s:

996

continue

Martin v. Löwis

2007-08-14 22:37:03 +0000

[diff] [blame]

997

Ezio Melotti

2011-09-30 08:46:25 +0300

[diff] [blame]

r, p = s.split(";")

r = r.strip()

p = p.strip()

if ".." in r:

first, last = [int(c, 16) for c in r.split('..')]

1003

chars = list(range(first, last+1))

else:

chars = [int(r, 16)]

for char in chars:

if table[char]:

# Some properties (e.g. Default_Ignorable_Code_Point)

1009

# apply to unassigned code points; ignore them

1010

table[char][-1].add(p)

1011

1012

with open_data(LINE_BREAK, version) as file:

1013

for s in file:

1014

s = s.partition('#')[0]

1015

s = [i.strip() for i in s.split(';')]

1016

if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:

1017

continue

1018

if '..' not in s[0]:

1019

first = last = int(s[0], 16)

1020

else:

1021

first, last = [int(c, 16) for c in s[0].split('..')]

1022

for char in range(first, last+1):

1023

table[char][-1].add('Line_Break')

Florent Xicluna

2010-03-30 19:34:18 +0000

[diff] [blame]

1024

Martin v. Löwis

2010-10-11 22:42:28 +0000

[diff] [blame]

1025

# We only want the quickcheck properties

1026

# Format: NF?_QC; Y(es)/N(o)/M(aybe)

1027

# Yes is the default, hence only N and M occur

1028

# In 3.2.0, the format was different (NF?_NO)

1029

# The parsing will incorrectly determine these as

1030

# "yes", however, unicodedata.c will not perform quickchecks

1031

# for older versions, and no delta records will be created.

1032

quickchecks = [0] * 0x110000

1033

qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()

Ezio Melotti

2011-09-30 08:46:25 +0300

[diff] [blame]

1034

with open_data(DERIVEDNORMALIZATION_PROPS, version) as file:

for s in file:

if '#' in s:

s = s[:s.index('#')]

s = [i.strip() for i in s.split(';')]

1039

if len(s) < 2 or s[1] not in qc_order:

1040

continue

1041

quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No

1042

quickcheck_shift = qc_order.index(s[1])*2

1043

quickcheck <<= quickcheck_shift

1044

if '..' not in s[0]:

1045

first = last = int(s[0], 16)

1046

else:

1047

first, last = [int(c, 16) for c in s[0].split('..')]

1048

for char in range(first, last+1):

1049

assert not (quickchecks[char]>>quickcheck_shift)&3

1050

quickchecks[char] |= quickcheck

Martin v. Löwis

2010-10-11 22:42:28 +0000

[diff] [blame]

1051

for i in range(0, 0x110000):

1052

if table[i] is not None:

1053

table[i].append(quickchecks[i])

Antoine Pitrou

2009-04-27 22:31:40 +0000

[diff] [blame]

1054

Ezio Melotti

2011-09-30 08:46:25 +0300

[diff] [blame]

1055

with open_data(UNIHAN, version) as file:

1056

zip = zipfile.ZipFile(file)

1057

if version == '3.2.0':

1058

data = zip.open('Unihan-3.2.0.txt').read()

1059

else:

1060

data = zip.open('Unihan_NumericValues.txt').read()

Martin v. Löwis

2010-10-11 22:42:28 +0000

[diff] [blame]

1061

for line in data.decode("utf-8").splitlines():

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

1062

if not line.startswith('U+'):

1063

continue

1064

code, tag, value = line.split(None, 3)[:3]

1065

if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',

1066

'kOtherNumeric'):

1067

continue

1068

value = value.strip().replace(',', '')

1069

i = int(code[2:], 16)

1070

# Patch the numeric field

1071

if table[i] is not None:

1072

table[i][8] = value

1073

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

1074

def uselatin1(self):

1075

# restrict character range to ISO Latin 1

Georg Brandl

bf82e37

2008-05-16 17:02:34 +0000

[diff] [blame]

1076

self.chars = list(range(256))

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

1077

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

1078

# hash table tools

1079

1080

# this is a straight-forward reimplementation of Python's built-in

1081

# dictionary type, using a static data structure, and a custom string

1082

# hash algorithm.

1083

1084

def myhash(s, magic):

1085

h = 0

Walter Dörwald

2002-09-11 20:36:02 +0000

[diff] [blame]

1086

for c in map(ord, s.upper()):

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

1087

h = (h * magic) + c

Guido van Rossum

cd16bf6

2007-06-13 18:07:49 +0000

[diff] [blame]

1088

ix = h & 0xff000000

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

1089

if ix:

1090

h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff

return h

SIZES = [

(4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),

1095

(1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),

1096

(65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),

1097

(2097152,5), (4194304,3), (8388608,33), (16777216,27)

]

class Hash:

def __init__(self, name, data, magic):

1102

# turn a (key, value) list into a static hash table structure

1103

1104

# determine table size

1105

for size, poly in SIZES:

if size > len(data):

poly = size + poly

break

else:

Ezio Melotti

1392500

2011-03-16 11:05:33 +0200

[diff] [blame]

1110

raise AssertionError("ran out of polynomials")

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

1111

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

1112

print(size, "slots in hash table")

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

1113

1114

table = [None] * size

mask = size-1

n = 0

hash = myhash

# initialize hash table

1123

for key, value in data:

h = hash(key, magic)

i = (~h) & mask

v = table[i]

if v is None:

table[i] = value

continue

incr = (h ^ (h >> 3)) & mask;

if not incr:

incr = mask

while 1:

n = n + 1

i = (i + incr) & mask

v = table[i]

if v is None:

table[i] = value

break

incr = incr << 1

if incr > mask:

incr = incr ^ poly

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

1144

print(n, "collisions")

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

1145

self.collisions = n

1146

1147

for i in range(len(table)):

if table[i] is None:

table[i] = 0

self.data = Array(name + "_hash", table)

self.magic = magic

self.name = name

self.size = size

self.poly = poly

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

1157

def dump(self, file, trace):

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

1158

# write data to file, as a C array

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

1159

self.data.dump(file, trace)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

1160

file.write("#define %s_magic %d\n" % (self.name, self.magic))

1161

file.write("#define %s_size %d\n" % (self.name, self.size))

1162

file.write("#define %s_poly %d\n" % (self.name, self.poly))

1163

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

1164

# stuff to deal with arrays of unsigned integers

class Array:

def __init__(self, name, data):

self.name = name

self.data = data

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

1172

def dump(self, file, trace=0):

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

1173

# write data to file, as a C array

1174

size = getsize(self.data)

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

1175

if trace:

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

1176

print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

1177

file.write("static ")

1178

if size == 1:

1179

file.write("unsigned char")

1180

elif size == 2:

1181

file.write("unsigned short")

1182

else:

1183

file.write("unsigned int")

1184

file.write(" " + self.name + "[] = {\n")

1185

if self.data:

1186

s = " "

1187

for item in self.data:

1188

i = str(item) + ", "

1189

if len(s) + len(i) > 78:

file.write(s + "\n")

s = " " + i

else:

s = s + i

Walter Dörwald

2002-09-11 20:36:02 +0000

[diff] [blame]

1194

if s.strip():

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

file.write(s + "\n")

file.write("};\n\n")

def getsize(data):

# return smallest possible integer size for the given array

maxdata = max(data)

if maxdata < 256:

return 1

elif maxdata < 65536:

return 2

else:

return 4

Tim Peters

2000-09-25 07:13:41 +0000

[diff] [blame]

1208

def splitbins(t, trace=0):

1209

"""t, trace=0 -> (t1, t2, shift). Split a table to save space.

1210

1211

t is a sequence of ints. This function can be useful to save space if

1212

many of the ints are the same. t1 and t2 are lists of ints, and shift

1213

is an int, chosen to minimize the combined size of t1 and t2 (in C

1214

code), and where for each i in range(len(t)),

1215

t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]

1216

where mask is a bitmask isolating the last "shift" bits.

1217

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

1218

If optional arg trace is non-zero (default zero), progress info

1219

is printed to sys.stderr. The higher the value, the more info

1220

you'll get.

Tim Peters

2000-09-25 07:13:41 +0000

[diff] [blame]

1221

"""

1222

Tim Peters

2000-09-25 07:13:41 +0000

[diff] [blame]

1223

if trace:

1224

def dump(t1, t2, shift, bytes):

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

1225

print("%d+%d bins at shift %d; %d bytes" % (

1226

len(t1), len(t2), shift, bytes), file=sys.stderr)

1227

print("Size of original table:", len(t)*getsize(t), \

1228

"bytes", file=sys.stderr)

Tim Peters

2000-09-25 07:13:41 +0000

[diff] [blame]

1229

n = len(t)-1 # last valid index

1230

maxshift = 0 # the most we can shift n and still have something left

if n > 0:

while n >> 1:

n >>= 1

maxshift += 1

del n

Christian Heimes

a37d4c6

2007-12-04 23:02:19 +0000

[diff] [blame]

1236

bytes = sys.maxsize # smallest total size so far

Tim Peters

2000-09-25 07:13:41 +0000

[diff] [blame]

1237

t = tuple(t) # so slices can be dict keys

1238

for shift in range(maxshift + 1):

1239

t1 = []

1240

t2 = []

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

1241

size = 2**shift

1242

bincache = {}

Tim Peters

2000-09-25 07:13:41 +0000

[diff] [blame]

1243

for i in range(0, len(t), size):

1244

bin = t[i:i+size]

1245

index = bincache.get(bin)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

1246

if index is None:

Tim Peters

2000-09-25 07:13:41 +0000

[diff] [blame]

1247

index = len(t2)

1248

bincache[bin] = index

1249

t2.extend(bin)

1250

t1.append(index >> shift)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

1251

# determine memory size

Tim Peters

2000-09-25 07:13:41 +0000

[diff] [blame]

1252

b = len(t1)*getsize(t1) + len(t2)*getsize(t2)

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

1253

if trace > 1:

Tim Peters

2000-09-25 07:13:41 +0000

[diff] [blame]

1254

dump(t1, t2, shift, b)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

1255

if b < bytes:

Tim Peters

2000-09-25 07:13:41 +0000

[diff] [blame]

1256

best = t1, t2, shift

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

1257

bytes = b

Tim Peters

2000-09-25 07:13:41 +0000

[diff] [blame]

1258

t1, t2, shift = best

1259

if trace:

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

1260

print("Best:", end=' ', file=sys.stderr)

Tim Peters

2000-09-25 07:13:41 +0000

[diff] [blame]

1261

dump(t1, t2, shift, bytes)

1262

if __debug__:

1263

# exhaustively verify that the decomposition is correct

1264

mask = ~((~0) << shift) # i.e., low-bit mask of shift bits

Guido van Rossum

805365e

2007-05-07 22:24:25 +0000

[diff] [blame]

1265

for i in range(len(t)):

Tim Peters

2000-09-25 07:13:41 +0000

[diff] [blame]

1266

assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]

1267

return best

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

1268

1269

if __name__ == "__main__":

Fredrik Lundh