Blame - Tools/unicode/makeunicodedata.py - platform/external/python/cpython2

2000-09-24 23:18:31 +0000

[diff] [blame]

1

#

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

2

# (re)generate unicode property and type databases

3

#

Martin v. Löwis

b5c980b

2002-11-25 09:13:37 +0000

[diff] [blame]

4

# this script converts a unicode 3.2 database file to

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

5

# Modules/unicodedata_db.h, Modules/unicodename_db.h,

6

# and Objects/unicodetype_db.h

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

7

#

8

# history:

9

# 2000-09-24 fl created (based on bits and pieces from unidb)

10

# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

11

# 2000-09-25 fl added character type table

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

12

# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

13

# 2000-11-03 fl expand first/last ranges

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

14

# 2001-01-19 fl added character name tables (2.1)

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

15

# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold

Martin v. Löwis

2002-11-23 22:08:15 +0000

[diff] [blame]

16

# 2002-09-11 wd use string methods

17

# 2002-10-18 mvl update to Unicode 3.2

18

# 2002-10-22 mvl generate NFC tables

Martin v. Löwis

2002-11-24 23:05:09 +0000

[diff] [blame]

19

# 2002-11-24 mvl expand all ranges, sort names version-independently

Martin v. Löwis

b5c980b

2002-11-25 09:13:37 +0000

[diff] [blame]

20

# 2002-11-25 mvl add UNIDATA_VERSION

Hye-Shik Chang

974ed7c

2004-06-02 16:49:17 +0000

[diff] [blame]

21

# 2004-05-29 perky add east asian width information

Martin v. Löwis

43179c8

2006-03-11 12:43:44 +0000

[diff] [blame]

22

# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta

Georg Brandl

d52429f

2008-07-04 15:55:02 +0000

[diff] [blame]

23

# 2008-06-11 gb add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch

Ezio Melotti

2011-10-21 21:57:36 +0300

[diff] [blame]

24

# 2011-10-21 ezio add support for name aliases and named sequences

Benjamin Peterson

2012-01-11 18:17:06 -0500

[diff] [blame^]

25

# 2012-01 benjamin add full case mappings

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

26

#

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

27

# written by Fredrik Lundh (fredrik@pythonware.com)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

28

#

29

Ezio Melotti

2011-10-21 21:57:36 +0300

[diff] [blame]

import os

import sys

import zipfile

from textwrap import dedent

35

from operator import itemgetter

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

36

37

SCRIPT = sys.argv[0]

Amaury Forgeot d'Arc

2010-08-18 20:44:58 +0000

[diff] [blame]

38

VERSION = "3.2"

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

39

Martin v. Löwis

b5c980b

2002-11-25 09:13:37 +0000

[diff] [blame]

40

# The Unicode Database

Martin v. Löwis

2010-10-11 22:42:28 +0000

[diff] [blame]

41

UNIDATA_VERSION = "6.0.0"

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

42

UNICODE_DATA = "UnicodeData%s.txt"

43

COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"

44

EASTASIAN_WIDTH = "EastAsianWidth%s.txt"

Martin v. Löwis

2010-10-11 22:42:28 +0000

[diff] [blame]

45

UNIHAN = "Unihan%s.zip"

Martin v. Löwis

2007-08-14 22:37:03 +0000

[diff] [blame]

46

DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"

Antoine Pitrou

2009-04-27 22:31:40 +0000

[diff] [blame]

47

DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"

Florent Xicluna

2010-03-30 19:34:18 +0000

[diff] [blame]

48

LINE_BREAK = "LineBreak%s.txt"

Ezio Melotti

2011-10-21 21:57:36 +0300

[diff] [blame]

49

NAME_ALIASES = "NameAliases%s.txt"

50

NAMED_SEQUENCES = "NamedSequences%s.txt"

Benjamin Peterson

2012-01-11 18:17:06 -0500

[diff] [blame^]

51

SPECIAL_CASING = "SpecialCasing%s.txt"

Ezio Melotti

2011-10-21 21:57:36 +0300

[diff] [blame]

52

53

# Private Use Areas -- in planes 1, 15, 16

54

PUA_1 = range(0xE000, 0xF900)

55

PUA_15 = range(0xF0000, 0xFFFFE)

56

PUA_16 = range(0x100000, 0x10FFFE)

57

58

# we use this ranges of PUA_15 to store name aliases and named sequences

59

NAME_ALIASES_START = 0xF0000

60

NAMED_SEQUENCES_START = 0xF0100

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

61

62

old_versions = ["3.2.0"]

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

63

64

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",

65

"Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",

66

"Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",

67

"So" ]

68

69

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",

70

"PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",

71

"ON" ]

72

Hye-Shik Chang

2004-08-04 07:38:35 +0000

[diff] [blame]

73

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

74

Florent Xicluna

2010-03-30 19:34:18 +0000

[diff] [blame]

75

MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

76

Fredrik Lundh

2000-09-25 21:01:56 +0000

[diff] [blame]

77

# note: should match definitions in Objects/unicodectype.c

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

ALPHA_MASK = 0x01

DECIMAL_MASK = 0x02

DIGIT_MASK = 0x04

LOWER_MASK = 0x08

Fredrik Lundh

2000-09-25 21:01:56 +0000

[diff] [blame]

82

LINEBREAK_MASK = 0x10

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

83

SPACE_MASK = 0x20

84

TITLE_MASK = 0x40

85

UPPER_MASK = 0x80

Martin v. Löwis

2007-08-14 22:37:03 +0000

[diff] [blame]

86

XID_START_MASK = 0x100

87

XID_CONTINUE_MASK = 0x200

Georg Brandl

d52429f

2008-07-04 15:55:02 +0000

[diff] [blame]

88

PRINTABLE_MASK = 0x400

Benjamin Peterson

2012-01-11 18:17:06 -0500

[diff] [blame^]

89

NUMERIC_MASK = 0x800

90

CASE_IGNORABLE_MASK = 0x1000

91

CASED_MASK = 0x2000

92

EXTENDED_CASE_MASK = 0x4000

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

93

Martin v. Löwis

2010-11-22 09:00:02 +0000

[diff] [blame]

94

# these ranges need to match unicodedata.c:is_unified_ideograph

cjk_ranges = [

('3400', '4DB5'),

('4E00', '9FCB'),

('20000', '2A6D6'),

('2A700', '2B734'),

('2B740', '2B81D')

]

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

103

def maketables(trace=0):

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

104

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

105

print("--- Reading", UNICODE_DATA % "", "...")

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

106

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

107

version = ""

Martin v. Löwis

2010-10-11 22:42:28 +0000

[diff] [blame]

108

unicode = UnicodeData(UNIDATA_VERSION)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

109

Georg Brandl

559e5d7

2008-06-11 18:37:52 +0000

[diff] [blame]

110

print(len(list(filter(None, unicode.table))), "characters")

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

111

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

112

for version in old_versions:

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

113

print("--- Reading", UNICODE_DATA % ("-"+version), "...")

Martin v. Löwis

2010-11-22 09:00:02 +0000

[diff] [blame]

114

old_unicode = UnicodeData(version, cjk_check=False)

Georg Brandl

559e5d7

2008-06-11 18:37:52 +0000

[diff] [blame]

115

print(len(list(filter(None, old_unicode.table))), "characters")

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

116

merge_old_version(version, unicode, old_unicode)

117

Fredrik Lundh

b2dfd73

2001-01-21 23:31:52 +0000

[diff] [blame]

118

makeunicodename(unicode, trace)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

119

makeunicodedata(unicode, trace)

Fredrik Lundh

b2dfd73

2001-01-21 23:31:52 +0000

[diff] [blame]

120

makeunicodetype(unicode, trace)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

121

122

# --------------------------------------------------------------------

123

# unicode character properties

124

125

def makeunicodedata(unicode, trace):

126

Antoine Pitrou

2009-04-27 22:31:40 +0000

[diff] [blame]

127

dummy = (0, 0, 0, 0, 0, 0)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

128

table = [dummy]

129

cache = {0: dummy}

130

index = [0] * len(unicode.chars)

131

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

132

FILE = "Modules/unicodedata_db.h"

133

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

134

print("--- Preparing", FILE, "...")

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

135

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

136

# 1) database properties

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

137

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

138

for char in unicode.chars:

139

record = unicode.table[char]

140

if record:

141

# extract database properties

142

category = CATEGORY_NAMES.index(record[2])

143

combining = int(record[3])

144

bidirectional = BIDIRECTIONAL_NAMES.index(record[4])

145

mirrored = record[9] == "Y"

Hye-Shik Chang

2004-08-04 07:38:35 +0000

[diff] [blame]

146

eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])

Antoine Pitrou

2009-04-27 22:31:40 +0000

[diff] [blame]

147

normalizationquickcheck = record[17]

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

148

item = (

Antoine Pitrou

2009-04-27 22:31:40 +0000

[diff] [blame]

149

category, combining, bidirectional, mirrored, eastasianwidth,

150

normalizationquickcheck

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

151

)

152

# add entry to index and item tables

153

i = cache.get(item)

154

if i is None:

155

cache[item] = i = len(table)

table.append(item)

index[char] = i

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

159

# 2) decomposition data

160

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

161

decomp_data = [0]

162

decomp_prefix = [""]

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

163

decomp_index = [0] * len(unicode.chars)

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

164

decomp_size = 0

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

165

Martin v. Löwis

2002-11-23 22:08:15 +0000

[diff] [blame]

166

comp_pairs = []

167

comp_first = [None] * len(unicode.chars)

168

comp_last = [None] * len(unicode.chars)

169

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

170

for char in unicode.chars:

171

record = unicode.table[char]

172

if record:

173

if record[5]:

Walter Dörwald

2002-09-11 20:36:02 +0000

[diff] [blame]

174

decomp = record[5].split()

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

175

if len(decomp) > 19:

Collin Winter

a817e58

2007-08-22 23:05:06 +0000

[diff] [blame]

176

raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

177

# prefix

178

if decomp[0][0] == "<":

179

prefix = decomp.pop(0)

180

else:

181

prefix = ""

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

182

try:

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

183

i = decomp_prefix.index(prefix)

184

except ValueError:

185

i = len(decomp_prefix)

186

decomp_prefix.append(prefix)

187

prefix = i

188

assert prefix < 256

189

# content

Georg Brandl

bf82e37

2008-05-16 17:02:34 +0000

[diff] [blame]

190

decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]

Martin v. Löwis

2002-11-23 22:08:15 +0000

[diff] [blame]

191

# Collect NFC pairs

192

if not prefix and len(decomp) == 3 and \

193

char not in unicode.exclusions and \

194

unicode.table[decomp[1]][3] == "0":

p, l, r = decomp

comp_first[l] = 1

comp_last[r] = 1

comp_pairs.append((l,r,char))

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

199

try:

200

i = decomp_data.index(decomp)

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

201

except ValueError:

202

i = len(decomp_data)

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

203

decomp_data.extend(decomp)

204

decomp_size = decomp_size + len(decomp) * 2

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

205

else:

206

i = 0

207

decomp_index[char] = i

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

208

Martin v. Löwis

2002-11-23 22:08:15 +0000

[diff] [blame]

209

f = l = 0

210

comp_first_ranges = []

211

comp_last_ranges = []

212

prev_f = prev_l = None

213

for i in unicode.chars:

214

if comp_first[i] is not None:

comp_first[i] = f

f += 1

if prev_f is None:

prev_f = (i,i)

elif prev_f[1]+1 == i:

220

prev_f = prev_f[0],i

221

else:

222

comp_first_ranges.append(prev_f)

223

prev_f = (i,i)

224

if comp_last[i] is not None:

comp_last[i] = l

l += 1

if prev_l is None:

prev_l = (i,i)

elif prev_l[1]+1 == i:

230

prev_l = prev_l[0],i

231

else:

232

comp_last_ranges.append(prev_l)

233

prev_l = (i,i)

234

comp_first_ranges.append(prev_f)

235

comp_last_ranges.append(prev_l)

total_first = f

total_last = l

comp_data = [0]*(total_first*total_last)

240

for f,l,char in comp_pairs:

241

f = comp_first[f]

242

l = comp_last[l]

243

comp_data[f*total_last+l] = char

244

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

245

print(len(table), "unique properties")

246

print(len(decomp_prefix), "unique decomposition prefixes")

247

print(len(decomp_data), "unique decomposition entries:", end=' ')

248

print(decomp_size, "bytes")

249

print(total_first, "first characters in NFC")

250

print(total_last, "last characters in NFC")

251

print(len(comp_pairs), "NFC pairs")

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

252

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

253

print("--- Writing", FILE, "...")

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

254

Fred Drake

9c68505

2000-10-26 03:56:46 +0000

[diff] [blame]

255

fp = open(FILE, "w")

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

256

print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)

257

print(file=fp)

258

print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)

259

print("/* a list of unique database records */", file=fp)

260

print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

261

for item in table:

Antoine Pitrou

2009-04-27 22:31:40 +0000

[diff] [blame]

262

print(" {%d, %d, %d, %d, %d, %d}," % item, file=fp)

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

263

print("};", file=fp)

264

print(file=fp)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

265

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

266

print("/* Reindexing of NFC first characters. */", file=fp)

267

print("#define TOTAL_FIRST",total_first, file=fp)

268

print("#define TOTAL_LAST",total_last, file=fp)

269

print("struct reindex{int start;short count,index;};", file=fp)

Martin v. Löwis

59683e8

2008-06-13 07:50:45 +0000

[diff] [blame]

270

print("static struct reindex nfc_first[] = {", file=fp)

Martin v. Löwis

2002-11-23 22:08:15 +0000

[diff] [blame]

271

for start,end in comp_first_ranges:

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

272

print(" { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)

273

print(" {0,0,0}", file=fp)

274

print("};\n", file=fp)

Martin v. Löwis

59683e8

2008-06-13 07:50:45 +0000

[diff] [blame]

275

print("static struct reindex nfc_last[] = {", file=fp)

Martin v. Löwis

2002-11-23 22:08:15 +0000

[diff] [blame]

276

for start,end in comp_last_ranges:

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

277

print(" { %d, %d, %d}," % (start,end-start,comp_last[start]), file=fp)

278

print(" {0,0,0}", file=fp)

279

print("};\n", file=fp)

Martin v. Löwis

2002-11-23 22:08:15 +0000

[diff] [blame]

280

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

281

# FIXME: <fl> the following tables could be made static, and

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

282

# the support code moved into unicodedatabase.c

283

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

284

print("/* string literals */", file=fp)

285

print("const char *_PyUnicode_CategoryNames[] = {", file=fp)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

286

for name in CATEGORY_NAMES:

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

287

print(" \"%s\"," % name, file=fp)

288

print(" NULL", file=fp)

289

print("};", file=fp)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

290

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

291

print("const char *_PyUnicode_BidirectionalNames[] = {", file=fp)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

292

for name in BIDIRECTIONAL_NAMES:

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

293

print(" \"%s\"," % name, file=fp)

294

print(" NULL", file=fp)

295

print("};", file=fp)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

296

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

297

print("const char *_PyUnicode_EastAsianWidthNames[] = {", file=fp)

Hye-Shik Chang

2004-08-04 07:38:35 +0000

[diff] [blame]

298

for name in EASTASIANWIDTH_NAMES:

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

299

print(" \"%s\"," % name, file=fp)

300

print(" NULL", file=fp)

301

print("};", file=fp)

Hye-Shik Chang

2004-08-04 07:38:35 +0000

[diff] [blame]

302

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

303

print("static const char *decomp_prefix[] = {", file=fp)

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

304

for name in decomp_prefix:

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

305

print(" \"%s\"," % name, file=fp)

306

print(" NULL", file=fp)

307

print("};", file=fp)

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

308

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

309

# split record index table

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

310

index1, index2, shift = splitbins(index, trace)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

311

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

312

print("/* index tables for the database records */", file=fp)

313

print("#define SHIFT", shift, file=fp)

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

314

Array("index1", index1).dump(fp, trace)

315

Array("index2", index2).dump(fp, trace)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

316

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

317

# split decomposition index table

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

318

index1, index2, shift = splitbins(decomp_index, trace)

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

319

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

320

print("/* decomposition data */", file=fp)

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

321

Array("decomp_data", decomp_data).dump(fp, trace)

322

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

323

print("/* index tables for the decomposition data */", file=fp)

324

print("#define DECOMP_SHIFT", shift, file=fp)

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

325

Array("decomp_index1", index1).dump(fp, trace)

326

Array("decomp_index2", index2).dump(fp, trace)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

327

Martin v. Löwis

2002-11-23 22:08:15 +0000

[diff] [blame]

328

index, index2, shift = splitbins(comp_data, trace)

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

329

print("/* NFC pairs */", file=fp)

330

print("#define COMP_SHIFT", shift, file=fp)

Martin v. Löwis

2002-11-23 22:08:15 +0000

[diff] [blame]

331

Array("comp_index", index).dump(fp, trace)

332

Array("comp_data", index2).dump(fp, trace)

333

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

334

# Generate delta tables for old versions

335

for version, table, normalization in unicode.changed:

336

cversion = version.replace(".","_")

337

records = [table[0]]

338

cache = {table[0]:0}

339

index = [0] * len(table)

340

for i, record in enumerate(table):

341

try:

342

index[i] = cache[record]

343

except KeyError:

344

index[i] = cache[record] = len(records)

345

records.append(record)

346

index1, index2, shift = splitbins(index, trace)

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

347

print("static const change_record change_records_%s[] = {" % cversion, file=fp)

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

348

for record in records:

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

349

print("\t{ %s }," % ", ".join(map(str,record)), file=fp)

350

print("};", file=fp)

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

351

Array("changes_%s_index" % cversion, index1).dump(fp, trace)

352

Array("changes_%s_data" % cversion, index2).dump(fp, trace)

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

353

print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)

354

print("{", file=fp)

355

print("\tint index;", file=fp)

356

print("\tif (n >= 0x110000) index = 0;", file=fp)

357

print("\telse {", file=fp)

358

print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)

359

print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \

360

(cversion, shift, ((1<<shift)-1)), file=fp)

361

print("\t}", file=fp)

362

print("\treturn change_records_%s+index;" % cversion, file=fp)

363

print("}\n", file=fp)

364

print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)

365

print("{", file=fp)

366

print("\tswitch(n) {", file=fp)

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

367

for k, v in normalization:

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

368

print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)

369

print("\tdefault: return 0;", file=fp)

370

print("\t}\n}\n", file=fp)

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

371

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

372

fp.close()

373

374

# --------------------------------------------------------------------

375

# unicode character type tables

376

377

def makeunicodetype(unicode, trace):

378

379

FILE = "Objects/unicodetype_db.h"

380

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

381

print("--- Preparing", FILE, "...")

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

382

383

# extract unicode types

Fredrik Lundh

2000-09-25 21:01:56 +0000

[diff] [blame]

384

dummy = (0, 0, 0, 0, 0, 0)

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

385

table = [dummy]

386

cache = {0: dummy}

387

index = [0] * len(unicode.chars)

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

388

numeric = {}

389

spaces = []

390

linebreaks = []

Benjamin Peterson

2012-01-11 18:17:06 -0500

[diff] [blame^]

391

extra_casing = []

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

392

393

for char in unicode.chars:

394

record = unicode.table[char]

395

if record:

396

# extract database properties

397

category = record[2]

398

bidirectional = record[4]

Martin v. Löwis

2007-08-14 22:37:03 +0000

[diff] [blame]

399

properties = record[16]

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

400

flags = 0

Martin v. Löwis

2008-09-10 14:08:48 +0000

[diff] [blame]

401

delta = True

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

402

if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:

403

flags |= ALPHA_MASK

Benjamin Peterson

2012-01-11 18:17:06 -0500

[diff] [blame^]

404

if "Lowercase" in properties:

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

405

flags |= LOWER_MASK

Florent Xicluna

2010-03-30 19:34:18 +0000

[diff] [blame]

406

if 'Line_Break' in properties or bidirectional == "B":

Fredrik Lundh

2000-09-25 21:01:56 +0000

[diff] [blame]

407

flags |= LINEBREAK_MASK

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

408

linebreaks.append(char)

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

409

if category == "Zs" or bidirectional in ("WS", "B", "S"):

410

flags |= SPACE_MASK

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

411

spaces.append(char)

Fredrik Lundh

375732c

2000-09-25 23:03:34 +0000

[diff] [blame]

412

if category == "Lt":

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

413

flags |= TITLE_MASK

Benjamin Peterson

2012-01-11 18:17:06 -0500

[diff] [blame^]

414

if "Uppercase" in properties:

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

415

flags |= UPPER_MASK

Benjamin Peterson

0983274

2009-03-26 17:15:46 +0000

[diff] [blame]

416

if char == ord(" ") or category[0] not in ("C", "Z"):

Georg Brandl

d52429f

2008-07-04 15:55:02 +0000

[diff] [blame]

417

flags |= PRINTABLE_MASK

Martin v. Löwis

2007-08-14 22:37:03 +0000

[diff] [blame]

418

if "XID_Start" in properties:

419

flags |= XID_START_MASK

420

if "XID_Continue" in properties:

421

flags |= XID_CONTINUE_MASK

Benjamin Peterson

2012-01-11 18:17:06 -0500

[diff] [blame^]

422

if "Cased" in properties:

423

flags |= CASED_MASK

424

if "Case_Ignorable" in properties:

425

flags |= CASE_IGNORABLE_MASK

426

sc = unicode.special_casing.get(char)

427

if sc is None:

428

if record[12]:

429

upper = int(record[12], 16)

else:

upper = char

if record[13]:

lower = int(record[13], 16)

else:

lower = char

if record[14]:

title = int(record[14], 16)

438

else:

439

title = upper

440

if upper == lower == title:

441

upper = lower = title = 0

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

442

else:

Benjamin Peterson

2012-01-11 18:17:06 -0500

[diff] [blame^]

443

# This happens when some character maps to more than one

444

# character in uppercase, lowercase, or titlecase. The extra

445

# characters are stored in a different array.

446

flags |= EXTENDED_CASE_MASK

447

lower = len(extra_casing) | (len(sc[0]) << 24)

448

extra_casing.extend(sc[0])

449

upper = len(extra_casing) | (len(sc[2]) << 24)

450

extra_casing.extend(sc[2])

451

# Title is probably equal to upper.

if sc[1] == sc[2]:

title = upper

else:

title = len(extra_casing) | (len(sc[1]) << 24)

456

extra_casing.extend(sc[1])

Fredrik Lundh

2000-09-25 21:01:56 +0000

[diff] [blame]

457

# decimal digit, integer digit

458

decimal = 0

459

if record[6]:

460

flags |= DECIMAL_MASK

461

decimal = int(record[6])

digit = 0

if record[7]:

flags |= DIGIT_MASK

digit = int(record[7])

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

466

if record[8]:

467

flags |= NUMERIC_MASK

468

numeric.setdefault(record[8], []).append(char)

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

469

item = (

Hye-Shik Chang

974ed7c

2004-06-02 16:49:17 +0000

[diff] [blame]

470

upper, lower, title, decimal, digit, flags

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

471

)

472

# add entry to index and item tables

473

i = cache.get(item)

474

if i is None:

475

cache[item] = i = len(table)

table.append(item)

index[char] = i

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

479

print(len(table), "unique character type entries")

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

480

print(sum(map(len, numeric.values())), "numeric code points")

481

print(len(spaces), "whitespace code points")

482

print(len(linebreaks), "linebreak code points")

Benjamin Peterson

2012-01-11 18:17:06 -0500

[diff] [blame^]

483

print(len(extra_casing), "extended case array")

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

484

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

485

print("--- Writing", FILE, "...")

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

486

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

487

fp = open(FILE, "w")

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

488

print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)

489

print(file=fp)

490

print("/* a list of unique character type descriptors */", file=fp)

491

print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

492

for item in table:

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

493

print(" {%d, %d, %d, %d, %d, %d}," % item, file=fp)

494

print("};", file=fp)

495

print(file=fp)

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

496

Benjamin Peterson

2012-01-11 18:17:06 -0500

[diff] [blame^]

497

print("/* extended case mappings */", file=fp)

498

print(file=fp)

499

print("const Py_UCS4 _PyUnicode_ExtendedCase[] = {", file=fp)

500

for c in extra_casing:

501

print(" %d," % c, file=fp)

print("};", file=fp)

print(file=fp)

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

505

# split decomposition index table

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

506

index1, index2, shift = splitbins(index, trace)

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

507

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

508

print("/* type indexes */", file=fp)

509

print("#define SHIFT", shift, file=fp)

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

510

Array("index1", index1).dump(fp, trace)

511

Array("index2", index2).dump(fp, trace)

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

512

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

513

# Generate code for _PyUnicode_ToNumeric()

514

numeric_items = sorted(numeric.items())

515

print('/* Returns the numeric value as double for Unicode characters', file=fp)

516

print(' * having this property, -1.0 otherwise.', file=fp)

517

print(' */', file=fp)

Amaury Forgeot d'Arc

2010-08-18 20:44:58 +0000

[diff] [blame]

518

print('double _PyUnicode_ToNumeric(Py_UCS4 ch)', file=fp)

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

519

print('{', file=fp)

520

print(' switch (ch) {', file=fp)

521

for value, codepoints in numeric_items:

Amaury Forgeot d'Arc

919765a

2009-10-13 23:18:53 +0000

[diff] [blame]

522

# Turn text into float literals

523

parts = value.split('/')

524

parts = [repr(float(part)) for part in parts]

525

value = '/'.join(parts)

526

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

527

codepoints.sort()

528

for codepoint in codepoints:

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

529

print(' case 0x%04X:' % (codepoint,), file=fp)

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

530

print(' return (double) %s;' % (value,), file=fp)

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

531

print(' }', file=fp)

532

print(' return -1.0;', file=fp)

print('}', file=fp)

print(file=fp)

# Generate code for _PyUnicode_IsWhitespace()

537

print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)

538

print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)

539

print(" */", file=fp)

Amaury Forgeot d'Arc

2010-08-18 20:44:58 +0000

[diff] [blame]

540

print('int _PyUnicode_IsWhitespace(register const Py_UCS4 ch)', file=fp)

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

541

print('{', file=fp)

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

542

print(' switch (ch) {', file=fp)

543

Florent Xicluna

f089fd6

2010-03-19 14:25:03 +0000

[diff] [blame]

544

for codepoint in sorted(spaces):

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

545

print(' case 0x%04X:' % (codepoint,), file=fp)

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

546

print(' return 1;', file=fp)

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

547

548

print(' }', file=fp)

549

print(' return 0;', file=fp)

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

print('}', file=fp)

print(file=fp)

# Generate code for _PyUnicode_IsLinebreak()

Florent Xicluna

2010-03-30 19:34:18 +0000

[diff] [blame]

554

print("/* Returns 1 for Unicode characters having the line break", file=fp)

555

print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp)

556

print(" * type 'B', 0 otherwise.", file=fp)

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

557

print(" */", file=fp)

Amaury Forgeot d'Arc

2010-08-18 20:44:58 +0000

[diff] [blame]

558

print('int _PyUnicode_IsLinebreak(register const Py_UCS4 ch)', file=fp)

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

559

print('{', file=fp)

560

print(' switch (ch) {', file=fp)

Florent Xicluna

f089fd6

2010-03-19 14:25:03 +0000

[diff] [blame]

561

for codepoint in sorted(linebreaks):

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

562

print(' case 0x%04X:' % (codepoint,), file=fp)

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

563

print(' return 1;', file=fp)

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

564

565

print(' }', file=fp)

566

print(' return 0;', file=fp)

print('}', file=fp)

print(file=fp)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

570

fp.close()

571

572

# --------------------------------------------------------------------

573

# unicode name database

574

575

def makeunicodename(unicode, trace):

576

577

FILE = "Modules/unicodename_db.h"

578

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

579

print("--- Preparing", FILE, "...")

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

580

581

# collect names

582

names = [None] * len(unicode.chars)

583

584

for char in unicode.chars:

585

record = unicode.table[char]

586

if record:

587

name = record[1].strip()

588

if name and name[0] != "<":

589

names[char] = name + chr(0)

590

Georg Brandl

559e5d7

2008-06-11 18:37:52 +0000

[diff] [blame]

591

print(len(list(n for n in names if n is not None)), "distinct names")

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

592

593

# collect unique words from names (note that we differ between

594

# words inside a sentence, and words ending a sentence. the

595

# latter includes the trailing null byte.

words = {}

n = b = 0

for char in unicode.chars:

name = names[char]

if name:

w = name.split()

b = b + len(name)

n = n + len(w)

for w in w:

l = words.get(w)

if l:

l.append(None)

else:

words[w] = [len(words)]

611

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

612

print(n, "words in text;", b, "bytes")

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

613

Martin v. Löwis

2007-08-14 22:37:03 +0000

[diff] [blame]

614

wordlist = list(words.items())

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

615

Martin v. Löwis

2002-11-24 23:05:09 +0000

[diff] [blame]

616

# sort on falling frequency, then by name

Mark Dickinson

a56c467

2009-01-27 18:17:45 +0000

[diff] [blame]

617

def word_key(a):

Martin v. Löwis

2007-08-14 22:37:03 +0000

[diff] [blame]

618

aword, alist = a

Mark Dickinson

a56c467

2009-01-27 18:17:45 +0000

[diff] [blame]

619

return -len(alist), aword

620

wordlist.sort(key=word_key)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

621

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

622

# figure out how many phrasebook escapes we need

623

escapes = 0

624

while escapes * 256 < len(wordlist):

625

escapes = escapes + 1

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

626

print(escapes, "escapes")

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

627

628

short = 256 - escapes

assert short > 0

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

632

print(short, "short indexes in lexicon")

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

633

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

634

# statistics

635

n = 0

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

636

for i in range(short):

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

637

n = n + len(wordlist[i][1])

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

638

print(n, "short indexes in phrasebook")

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

639

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

640

# pick the most commonly used words, and sort the rest on falling

641

# length (to maximize overlap)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

642

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

643

wordlist, wordtail = wordlist[:short], wordlist[short:]

Raymond Hettinger

d4cb56d

2008-01-30 02:55:10 +0000

[diff] [blame]

644

wordtail.sort(key=lambda a: a[0], reverse=True)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

645

wordlist.extend(wordtail)

646

647

# generate lexicon from words

lexicon_offset = [0]

lexicon = ""

words = {}

# build a lexicon string

654

offset = 0

655

for w, x in wordlist:

656

# encoding: bit 7 indicates last character in word (chr(128)

657

# indicates the last character in an entire string)

658

ww = w[:-1] + chr(ord(w[-1])+128)

659

# reuse string tails, when possible

Walter Dörwald

2002-09-11 20:36:02 +0000

[diff] [blame]

660

o = lexicon.find(ww)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

661

if o < 0:

662

o = offset

663

lexicon = lexicon + ww

664

offset = offset + len(w)

665

words[w] = len(lexicon_offset)

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

666

lexicon_offset.append(o)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

667

Martin v. Löwis

2007-08-14 22:37:03 +0000

[diff] [blame]

668

lexicon = list(map(ord, lexicon))

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

669

670

# generate phrasebook from names and lexicon

671

phrasebook = [0]

672

phrasebook_offset = [0] * len(unicode.chars)

673

for char in unicode.chars:

name = names[char]

if name:

w = name.split()

phrasebook_offset[char] = len(phrasebook)

678

for w in w:

679

i = words[w]

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

680

if i < short:

681

phrasebook.append(i)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

682

else:

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

683

# store as two bytes

684

phrasebook.append((i>>8) + short)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

685

phrasebook.append(i&255)

686

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

687

assert getsize(phrasebook) == 1

688

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

689

#

690

# unicode name hash table

# extract names

data = []

for char in unicode.chars:

695

record = unicode.table[char]

696

if record:

697

name = record[1].strip()

698

if name and name[0] != "<":

699

data.append((name, char))

700

701

# the magic number 47 was chosen to minimize the number of

702

# collisions on the current data set. if you like, change it

703

# and see what happens...

704

705

codehash = Hash("code", data, 47)

706

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

707

print("--- Writing", FILE, "...")

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

708

709

fp = open(FILE, "w")

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

710

print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)

711

print(file=fp)

712

print("#define NAME_MAXLEN", 256, file=fp)

713

print(file=fp)

714

print("/* lexicon */", file=fp)

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

715

Array("lexicon", lexicon).dump(fp, trace)

716

Array("lexicon_offset", lexicon_offset).dump(fp, trace)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

717

718

# split decomposition index table

719

offset1, offset2, shift = splitbins(phrasebook_offset, trace)

720

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

721

print("/* code->name phrasebook */", file=fp)

722

print("#define phrasebook_shift", shift, file=fp)

723

print("#define phrasebook_short", short, file=fp)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

724

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

725

Array("phrasebook", phrasebook).dump(fp, trace)

726

Array("phrasebook_offset1", offset1).dump(fp, trace)

727

Array("phrasebook_offset2", offset2).dump(fp, trace)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

728

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

729

print("/* name->code dictionary */", file=fp)

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

730

codehash.dump(fp, trace)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

731

Ezio Melotti

2011-10-21 21:57:36 +0300

[diff] [blame]

732

print(file=fp)

733

print('static const unsigned int aliases_start = %#x;' %

734

NAME_ALIASES_START, file=fp)

735

print('static const unsigned int aliases_end = %#x;' %

736

(NAME_ALIASES_START + len(unicode.aliases)), file=fp)

737

738

print('static const unsigned int name_aliases[] = {', file=fp)

739

for name, codepoint in unicode.aliases:

740

print(' 0x%04X,' % codepoint, file=fp)

741

print('};', file=fp)

742

743

# In Unicode 6.0.0, the sequences contain at most 4 BMP chars,

744

# so we are using Py_UCS2 seq[4]. This needs to be updated if longer

745

# sequences or sequences with non-BMP chars are added.

746

# unicodedata_lookup should be adapted too.

747

print(dedent("""

748

typedef struct NamedSequence {

int seqlen;

Py_UCS2 seq[4];

} named_sequence;

"""), file=fp)

print('static const unsigned int named_sequences_start = %#x;' %

755

NAMED_SEQUENCES_START, file=fp)

756

print('static const unsigned int named_sequences_end = %#x;' %

757

(NAMED_SEQUENCES_START + len(unicode.named_sequences)), file=fp)

758

759

print('static const named_sequence named_sequences[] = {', file=fp)

760

for name, sequence in unicode.named_sequences:

761

seq_str = ', '.join('0x%04X' % cp for cp in sequence)

762

print(' {%d, {%s}},' % (len(sequence), seq_str), file=fp)

763

print('};', file=fp)

764

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

765

fp.close()

766

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

767

768

def merge_old_version(version, new, old):

769

# Changes to exclusion file not implemented yet

770

if old.exclusions != new.exclusions:

Collin Winter

a817e58

2007-08-22 23:05:06 +0000

[diff] [blame]

771

raise NotImplementedError("exclusions differ")

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

772

773

# In these change records, 0xFF means "no change"

774

bidir_changes = [0xFF]*0x110000

775

category_changes = [0xFF]*0x110000

776

decimal_changes = [0xFF]*0x110000

Martin v. Löwis

2008-09-10 14:08:48 +0000

[diff] [blame]

777

mirrored_changes = [0xFF]*0x110000

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

778

# In numeric data, 0 means "no change",

779

# -1 means "did not have a numeric value

780

numeric_changes = [0] * 0x110000

781

# normalization_changes is a list of key-value pairs

782

normalization_changes = []

783

for i in range(0x110000):

784

if new.table[i] is None:

785

# Characters unassigned in the new version ought to

786

# be unassigned in the old one

787

assert old.table[i] is None

788

continue

789

# check characters unassigned in the old version

790

if old.table[i] is None:

791

# category 0 is "unassigned"

792

category_changes[i] = 0

793

continue

794

# check characters that differ

795

if old.table[i] != new.table[i]:

796

for k in range(len(old.table[i])):

797

if old.table[i][k] != new.table[i][k]:

798

value = old.table[i][k]

Ezio Melotti

2011-10-21 21:57:36 +0300

[diff] [blame]

799

if k == 1 and i in PUA_15:

800

# the name is not set in the old.table, but in the

801

# new.table we are using it for aliases and named seq

802

assert value == ''

803

elif k == 2:

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

804

#print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]

805

category_changes[i] = CATEGORY_NAMES.index(value)

806

elif k == 4:

807

#print "BIDIR",hex(i), old.table[i][k], new.table[i][k]

808

bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)

809

elif k == 5:

810

#print "DECOMP",hex(i), old.table[i][k], new.table[i][k]

811

# We assume that all normalization changes are in 1:1 mappings

812

assert " " not in value

813

normalization_changes.append((i, value))

814

elif k == 6:

815

#print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]

816

# we only support changes where the old value is a single digit

817

assert value in "0123456789"

818

decimal_changes[i] = int(value)

819

elif k == 8:

820

# print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]

821

# Since 0 encodes "no change", the old value is better not 0

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

822

if not value:

823

numeric_changes[i] = -1

824

else:

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

825

numeric_changes[i] = float(value)

826

assert numeric_changes[i] not in (0, -1)

Martin v. Löwis

2008-09-10 14:08:48 +0000

[diff] [blame]

827

elif k == 9:

828

if value == 'Y':

829

mirrored_changes[i] = '1'

830

else:

831

mirrored_changes[i] = '0'

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

832

elif k == 11:

833

# change to ISO comment, ignore

834

pass

835

elif k == 12:

836

# change to simple uppercase mapping; ignore

837

pass

838

elif k == 13:

839

# change to simple lowercase mapping; ignore

840

pass

841

elif k == 14:

842

# change to simple titlecase mapping; ignore

843

pass

Martin v. Löwis

2007-08-14 22:37:03 +0000

[diff] [blame]

844

elif k == 16:

845

# derived property changes; not yet

846

pass

Martin v. Löwis

2010-10-11 22:42:28 +0000

[diff] [blame]

847

elif k == 17:

848

# normalization quickchecks are not performed

849

# for older versions

850

pass

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

851

else:

852

class Difference(Exception):pass

Collin Winter

a817e58

2007-08-22 23:05:06 +0000

[diff] [blame]

853

raise Difference(hex(i), k, old.table[i], new.table[i])

Martin v. Löwis

2007-08-14 22:37:03 +0000

[diff] [blame]

854

new.changed.append((version, list(zip(bidir_changes, category_changes,

Martin v. Löwis

2008-09-10 14:08:48 +0000

[diff] [blame]

855

decimal_changes, mirrored_changes,

856

numeric_changes)),

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

857

normalization_changes))

Tim Peters

88ca467

2006-03-10 23:39:56 +0000

[diff] [blame]

858

Martin v. Löwis

2010-10-11 22:42:28 +0000

[diff] [blame]

859

def open_data(template, version):

860

local = template % ('-'+version,)

861

if not os.path.exists(local):

862

import urllib.request

863

if version == '3.2.0':

864

# irregular url structure

865

url = 'http://www.unicode.org/Public/3.2-Update/' + local

866

else:

867

url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')

868

urllib.request.urlretrieve(url, filename=local)

869

if local.endswith('.txt'):

870

return open(local, encoding='utf-8')

871

else:

872

# Unihan.zip

873

return open(local, 'rb')

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

874

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

875

# --------------------------------------------------------------------

876

# the following support code is taken from the unidb utilities

877

878

879

# load a unicode-data file from disk

880

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

881

class UnicodeData:

Martin v. Löwis

2007-08-14 22:37:03 +0000

[diff] [blame]

882

# Record structure:

883

# [ID, name, category, combining, bidi, decomp, (6)

884

# decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)

885

# ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)

886

# derived-props] (17)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

887

Martin v. Löwis

2010-10-11 22:42:28 +0000

[diff] [blame]

888

def __init__(self, version,

889

linebreakprops=False,

Martin v. Löwis

2010-11-22 09:00:02 +0000

[diff] [blame]

890

expand=1,

891

cjk_check=True):

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

892

self.changed = []

Martin v. Löwis

9def6a3

2002-10-18 16:11:54 +0000

[diff] [blame]

893

table = [None] * 0x110000

Ezio Melotti

2011-09-30 08:46:25 +0300

[diff] [blame]

894

with open_data(UNICODE_DATA, version) as file:

while 1:

s = file.readline()

if not s:

break

s = s.strip().split(";")

900

char = int(s[0], 16)

901

table[char] = s

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

902

Martin v. Löwis

2010-11-22 09:00:02 +0000

[diff] [blame]

903

cjk_ranges_found = []

904

Martin v. Löwis

2002-11-24 23:05:09 +0000

[diff] [blame]

905

# expand first-last ranges

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

906

if expand:

907

field = None

Martin v. Löwis

2002-11-24 23:05:09 +0000

[diff] [blame]

908

for i in range(0, 0x110000):

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

909

s = table[i]

910

if s:

911

if s[1][-6:] == "First>":

912

s[1] = ""

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

913

field = s

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

914

elif s[1][-5:] == "Last>":

Martin v. Löwis

2010-11-22 09:00:02 +0000

[diff] [blame]

915

if s[1].startswith("<CJK Ideograph"):

916

cjk_ranges_found.append((field[0],

917

s[0]))

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

918

s[1] = ""

919

field = None

920

elif field:

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

921

f2 = field[:]

922

f2[0] = "%X" % i

923

table[i] = f2

Martin v. Löwis

2010-11-22 09:00:02 +0000

[diff] [blame]

924

if cjk_check and cjk_ranges != cjk_ranges_found:

925

raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

926

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

927

# public attributes

Martin v. Löwis

2010-10-11 22:42:28 +0000

[diff] [blame]

928

self.filename = UNICODE_DATA % ''

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

929

self.table = table

Georg Brandl

bf82e37

2008-05-16 17:02:34 +0000

[diff] [blame]

930

self.chars = list(range(0x110000)) # unicode 3.2

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

931

Ezio Melotti

2011-10-21 21:57:36 +0300

[diff] [blame]

932

# check for name aliases and named sequences, see #12753

933

# aliases and named sequences are not in 3.2.0

934

if version != '3.2.0':

935

self.aliases = []

936

# store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,

937

# in order to take advantage of the compression and lookup

938

# algorithms used for the other characters

939

pua_index = NAME_ALIASES_START

940

with open_data(NAME_ALIASES, version) as file:

941

for s in file:

942

s = s.strip()

943

if not s or s.startswith('#'):

944

continue

945

char, name = s.split(';')

946

char = int(char, 16)

947

self.aliases.append((name, char))

948

# also store the name in the PUA 1

949

self.table[pua_index][1] = name

950

pua_index += 1

951

assert pua_index - NAME_ALIASES_START == len(self.aliases)

952

953

self.named_sequences = []

954

# store named seqences in the PUA 1, in range U+F0100..,

955

# in order to take advantage of the compression and lookup

956

# algorithms used for the other characters.

957

958

pua_index = NAMED_SEQUENCES_START

959

with open_data(NAMED_SEQUENCES, version) as file:

960

for s in file:

961

s = s.strip()

962

if not s or s.startswith('#'):

963

continue

964

name, chars = s.split(';')

965

chars = tuple(int(char, 16) for char in chars.split())

966

# check that the structure defined in makeunicodename is OK

967

assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"

968

assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "

969

"the NamedSequence struct and in unicodedata_lookup")

970

self.named_sequences.append((name, chars))

971

# also store these in the PUA 1

972

self.table[pua_index][1] = name

973

pua_index += 1

974

assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)

975

Martin v. Löwis

2002-11-23 22:08:15 +0000

[diff] [blame]

976

self.exclusions = {}

Ezio Melotti

2011-09-30 08:46:25 +0300

[diff] [blame]

977

with open_data(COMPOSITION_EXCLUSIONS, version) as file:

for s in file:

s = s.strip()

if not s:

continue

if s[0] == '#':

continue

char = int(s.split()[0],16)

985

self.exclusions[char] = 1

Martin v. Löwis

2002-11-23 22:08:15 +0000

[diff] [blame]

986

Hye-Shik Chang

974ed7c

2004-06-02 16:49:17 +0000

[diff] [blame]

987

widths = [None] * 0x110000

Ezio Melotti

2011-09-30 08:46:25 +0300

[diff] [blame]

988

with open_data(EASTASIAN_WIDTH, version) as file:

for s in file:

s = s.strip()

if not s:

continue

if s[0] == '#':

continue

s = s.split()[0].split(';')

996

if '..' in s[0]:

997

first, last = [int(c, 16) for c in s[0].split('..')]

998

chars = list(range(first, last+1))

999

else:

1000

chars = [int(s[0], 16)]

for char in chars:

widths[char] = s[1]

Hye-Shik Chang

2004-06-02 16:49:17 +0000

[diff] [blame]

1004

for i in range(0, 0x110000):

1005

if table[i] is not None:

1006

table[i].append(widths[i])

1007

Martin v. Löwis

2007-08-14 22:37:03 +0000

[diff] [blame]

1008

for i in range(0, 0x110000):

1009

if table[i] is not None:

1010

table[i].append(set())

Martin v. Löwis

2007-08-14 22:37:03 +0000

[diff] [blame]

1011

Ezio Melotti

2011-09-30 08:46:25 +0300

[diff] [blame]

1012

with open_data(DERIVED_CORE_PROPERTIES, version) as file:

1013

for s in file:

1014

s = s.split('#', 1)[0].strip()

1015

if not s:

1016

continue

Martin v. Löwis

2007-08-14 22:37:03 +0000

[diff] [blame]

1017

Ezio Melotti

2011-09-30 08:46:25 +0300

[diff] [blame]

r, p = s.split(";")

r = r.strip()

p = p.strip()

if ".." in r:

first, last = [int(c, 16) for c in r.split('..')]

1023

chars = list(range(first, last+1))

else:

chars = [int(r, 16)]

for char in chars:

if table[char]:

# Some properties (e.g. Default_Ignorable_Code_Point)

1029

# apply to unassigned code points; ignore them

1030

table[char][-1].add(p)

1031

1032

with open_data(LINE_BREAK, version) as file:

1033

for s in file:

1034

s = s.partition('#')[0]

1035

s = [i.strip() for i in s.split(';')]

1036

if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:

1037

continue

1038

if '..' not in s[0]:

1039

first = last = int(s[0], 16)

1040

else:

1041

first, last = [int(c, 16) for c in s[0].split('..')]

1042

for char in range(first, last+1):

1043

table[char][-1].add('Line_Break')

Florent Xicluna

2010-03-30 19:34:18 +0000

[diff] [blame]

1044

Martin v. Löwis

2010-10-11 22:42:28 +0000

[diff] [blame]

1045

# We only want the quickcheck properties

1046

# Format: NF?_QC; Y(es)/N(o)/M(aybe)

1047

# Yes is the default, hence only N and M occur

1048

# In 3.2.0, the format was different (NF?_NO)

1049

# The parsing will incorrectly determine these as

1050

# "yes", however, unicodedata.c will not perform quickchecks

1051

# for older versions, and no delta records will be created.

1052

quickchecks = [0] * 0x110000

1053

qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()

Ezio Melotti

2011-09-30 08:46:25 +0300

[diff] [blame]

1054

with open_data(DERIVEDNORMALIZATION_PROPS, version) as file:

for s in file:

if '#' in s:

s = s[:s.index('#')]

s = [i.strip() for i in s.split(';')]

1059

if len(s) < 2 or s[1] not in qc_order:

1060

continue

1061

quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No

1062

quickcheck_shift = qc_order.index(s[1])*2

1063

quickcheck <<= quickcheck_shift

1064

if '..' not in s[0]:

1065

first = last = int(s[0], 16)

1066

else:

1067

first, last = [int(c, 16) for c in s[0].split('..')]

1068

for char in range(first, last+1):

1069

assert not (quickchecks[char]>>quickcheck_shift)&3

1070

quickchecks[char] |= quickcheck

Martin v. Löwis

2010-10-11 22:42:28 +0000

[diff] [blame]

1071

for i in range(0, 0x110000):

1072

if table[i] is not None:

1073

table[i].append(quickchecks[i])

Antoine Pitrou

2009-04-27 22:31:40 +0000

[diff] [blame]

1074

Ezio Melotti

2011-09-30 08:46:25 +0300

[diff] [blame]

1075

with open_data(UNIHAN, version) as file:

1076

zip = zipfile.ZipFile(file)

1077

if version == '3.2.0':

1078

data = zip.open('Unihan-3.2.0.txt').read()

1079

else:

1080

data = zip.open('Unihan_NumericValues.txt').read()

Martin v. Löwis

2010-10-11 22:42:28 +0000

[diff] [blame]

1081

for line in data.decode("utf-8").splitlines():

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

1082

if not line.startswith('U+'):

1083

continue

1084

code, tag, value = line.split(None, 3)[:3]

1085

if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',

1086

'kOtherNumeric'):

1087

continue

1088

value = value.strip().replace(',', '')

1089

i = int(code[2:], 16)

1090

# Patch the numeric field

1091

if table[i] is not None:

1092

table[i][8] = value

Benjamin Peterson

2012-01-11 18:17:06 -0500

[diff] [blame^]

1093

sc = self.special_casing = {}

1094

with open_data(SPECIAL_CASING, version) as file:

1095

for s in file:

1096

s = s[:-1].split('#', 1)[0]

if not s:

continue

data = s.split("; ")

if data[4]:

# We ignore all conditionals (since they depend on

1102

# languages) except for one, which is hardcoded. See

1103

# handle_capital_sigma in unicodeobject.c.

1104

continue

1105

c = int(data[0], 16)

1106

lower = [int(char, 16) for char in data[1].split()]

1107

title = [int(char, 16) for char in data[2].split()]

1108

upper = [int(char, 16) for char in data[3].split()]

1109

sc[c] = (lower, title, upper)

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

1110

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

1111

def uselatin1(self):

1112

# restrict character range to ISO Latin 1

Georg Brandl

bf82e37

2008-05-16 17:02:34 +0000

[diff] [blame]

1113

self.chars = list(range(256))

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

1114

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

1115

# hash table tools

1116

1117

# this is a straight-forward reimplementation of Python's built-in

1118

# dictionary type, using a static data structure, and a custom string

1119

# hash algorithm.

1120

1121

def myhash(s, magic):

1122

h = 0

Walter Dörwald

2002-09-11 20:36:02 +0000

[diff] [blame]

1123

for c in map(ord, s.upper()):

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

1124

h = (h * magic) + c

Guido van Rossum

cd16bf6

2007-06-13 18:07:49 +0000

[diff] [blame]

1125

ix = h & 0xff000000

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

1126

if ix:

1127

h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff

return h

SIZES = [

(4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),

1132

(1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),

1133

(65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),

1134

(2097152,5), (4194304,3), (8388608,33), (16777216,27)

]

class Hash:

def __init__(self, name, data, magic):

1139

# turn a (key, value) list into a static hash table structure

1140

1141

# determine table size

1142

for size, poly in SIZES:

if size > len(data):

poly = size + poly

break

else:

Ezio Melotti

1392500

2011-03-16 11:05:33 +0200

[diff] [blame]

1147

raise AssertionError("ran out of polynomials")

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

1148

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

1149

print(size, "slots in hash table")

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

1150

1151

table = [None] * size

mask = size-1

n = 0

hash = myhash

# initialize hash table

1160

for key, value in data:

h = hash(key, magic)

i = (~h) & mask

v = table[i]

if v is None:

table[i] = value

continue

incr = (h ^ (h >> 3)) & mask;

if not incr:

incr = mask

while 1:

n = n + 1

i = (i + incr) & mask

v = table[i]

if v is None:

table[i] = value

break

incr = incr << 1

if incr > mask:

incr = incr ^ poly

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

1181

print(n, "collisions")

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

1182

self.collisions = n

1183

1184

for i in range(len(table)):

if table[i] is None:

table[i] = 0

self.data = Array(name + "_hash", table)

self.magic = magic

self.name = name

self.size = size

self.poly = poly

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

1194

def dump(self, file, trace):

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

1195

# write data to file, as a C array

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

1196

self.data.dump(file, trace)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

1197

file.write("#define %s_magic %d\n" % (self.name, self.magic))

1198

file.write("#define %s_size %d\n" % (self.name, self.size))

1199

file.write("#define %s_poly %d\n" % (self.name, self.poly))

1200

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

1201

# stuff to deal with arrays of unsigned integers

class Array:

def __init__(self, name, data):

self.name = name

self.data = data

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

1209

def dump(self, file, trace=0):

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

1210

# write data to file, as a C array

1211

size = getsize(self.data)

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

1212

if trace:

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

1213

print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

1214

file.write("static ")

1215

if size == 1:

1216

file.write("unsigned char")

1217

elif size == 2:

1218

file.write("unsigned short")

1219

else:

1220

file.write("unsigned int")

1221

file.write(" " + self.name + "[] = {\n")

1222

if self.data:

1223

s = " "

1224

for item in self.data:

1225

i = str(item) + ", "

1226

if len(s) + len(i) > 78:

file.write(s + "\n")

s = " " + i

else:

s = s + i

Walter Dörwald

2002-09-11 20:36:02 +0000

[diff] [blame]

1231

if s.strip():

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

file.write(s + "\n")

file.write("};\n\n")

def getsize(data):

# return smallest possible integer size for the given array

maxdata = max(data)

if maxdata < 256:

return 1

elif maxdata < 65536:

return 2

else:

return 4

Tim Peters

2000-09-25 07:13:41 +0000

[diff] [blame]

1245

def splitbins(t, trace=0):

1246

"""t, trace=0 -> (t1, t2, shift). Split a table to save space.

1247

1248

t is a sequence of ints. This function can be useful to save space if

1249

many of the ints are the same. t1 and t2 are lists of ints, and shift

1250

is an int, chosen to minimize the combined size of t1 and t2 (in C

1251

code), and where for each i in range(len(t)),

1252

t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]

1253

where mask is a bitmask isolating the last "shift" bits.

1254

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

1255

If optional arg trace is non-zero (default zero), progress info

1256

is printed to sys.stderr. The higher the value, the more info

1257

you'll get.

Tim Peters

2000-09-25 07:13:41 +0000

[diff] [blame]

1258

"""

1259

Tim Peters

2000-09-25 07:13:41 +0000

[diff] [blame]

1260

if trace:

1261

def dump(t1, t2, shift, bytes):

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

1262

print("%d+%d bins at shift %d; %d bytes" % (

1263

len(t1), len(t2), shift, bytes), file=sys.stderr)

1264

print("Size of original table:", len(t)*getsize(t), \

1265

"bytes", file=sys.stderr)

Tim Peters

2000-09-25 07:13:41 +0000

[diff] [blame]

1266

n = len(t)-1 # last valid index

1267

maxshift = 0 # the most we can shift n and still have something left

if n > 0:

while n >> 1:

n >>= 1

maxshift += 1

del n

Christian Heimes

a37d4c6

2007-12-04 23:02:19 +0000

[diff] [blame]

1273

bytes = sys.maxsize # smallest total size so far

Tim Peters

2000-09-25 07:13:41 +0000

[diff] [blame]

1274

t = tuple(t) # so slices can be dict keys

1275

for shift in range(maxshift + 1):

1276

t1 = []

1277

t2 = []

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

1278

size = 2**shift

1279

bincache = {}

Tim Peters

2000-09-25 07:13:41 +0000

[diff] [blame]

1280

for i in range(0, len(t), size):

1281

bin = t[i:i+size]

1282

index = bincache.get(bin)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

1283

if index is None:

Tim Peters

2000-09-25 07:13:41 +0000

[diff] [blame]

1284

index = len(t2)

1285

bincache[bin] = index

1286

t2.extend(bin)

1287

t1.append(index >> shift)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

1288

# determine memory size

Tim Peters

2000-09-25 07:13:41 +0000

[diff] [blame]

1289

b = len(t1)*getsize(t1) + len(t2)*getsize(t2)

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

1290

if trace > 1:

Tim Peters

2000-09-25 07:13:41 +0000

[diff] [blame]

1291

dump(t1, t2, shift, b)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

1292

if b < bytes:

Tim Peters

2000-09-25 07:13:41 +0000

[diff] [blame]

1293

best = t1, t2, shift

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

1294

bytes = b

Tim Peters

2000-09-25 07:13:41 +0000

[diff] [blame]

1295

t1, t2, shift = best

1296

if trace:

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

1297

print("Best:", end=' ', file=sys.stderr)

Tim Peters

2000-09-25 07:13:41 +0000

[diff] [blame]

1298

dump(t1, t2, shift, bytes)

1299

if __debug__:

1300

# exhaustively verify that the decomposition is correct

1301

mask = ~((~0) << shift) # i.e., low-bit mask of shift bits

Guido van Rossum

805365e

2007-05-07 22:24:25 +0000

[diff] [blame]

1302

for i in range(len(t)):

Tim Peters

2000-09-25 07:13:41 +0000

[diff] [blame]

1303

assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]

1304

return best

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

1305

1306

if __name__ == "__main__":

Fredrik Lundh