blob: 0795d9e72fff87a30b74924292ac60ecf5c85ea8 [file] [log] [blame]
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001#
Fredrik Lundhe9133f72000-09-25 17:59:57 +00002# (re)generate unicode property and type databases
3#
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00004# this script converts a unicode 3.2 database file to
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00005# Modules/unicodedata_db.h, Modules/unicodename_db.h,
6# and Objects/unicodetype_db.h
Fredrik Lundhcfcea492000-09-25 08:07:06 +00007#
8# history:
9# 2000-09-24 fl created (based on bits and pieces from unidb)
10# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
Fredrik Lundhe9133f72000-09-25 17:59:57 +000011# 2000-09-25 fl added character type table
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000012# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000013# 2000-11-03 fl expand first/last ranges
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000014# 2001-01-19 fl added character name tables (2.1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000015# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold
Martin v. Löwis677bde22002-11-23 22:08:15 +000016# 2002-09-11 wd use string methods
17# 2002-10-18 mvl update to Unicode 3.2
18# 2002-10-22 mvl generate NFC tables
Martin v. Löwis97225da2002-11-24 23:05:09 +000019# 2002-11-24 mvl expand all ranges, sort names version-independently
Martin v. Löwisb5c980b2002-11-25 09:13:37 +000020# 2002-11-25 mvl add UNIDATA_VERSION
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +000021# 2004-05-29 perky add east asian width information
Martin v. Löwis43179c82006-03-11 12:43:44 +000022# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
Georg Brandld52429f2008-07-04 15:55:02 +000023# 2008-06-11 gb add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
Ezio Melotti931b8aa2011-10-21 21:57:36 +030024# 2011-10-21 ezio add support for name aliases and named sequences
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050025# 2012-01 benjamin add full case mappings
Fredrik Lundhcfcea492000-09-25 08:07:06 +000026#
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000027# written by Fredrik Lundh (fredrik@pythonware.com)
Fredrik Lundhf367cac2000-09-24 23:18:31 +000028#
29
Ezio Melotti931b8aa2011-10-21 21:57:36 +030030import os
31import sys
32import zipfile
33
34from textwrap import dedent
35from operator import itemgetter
Fredrik Lundhf367cac2000-09-24 23:18:31 +000036
SCRIPT = sys.argv[0]   # name of this generator, embedded in the output headers
VERSION = "3.2"        # version of this *script*, not of Unicode

# The Unicode Database
UNIDATA_VERSION = "6.0.0"
# UCD filename templates; "%s" is filled with "" for the current version
# or "-<version>" for an old UCD snapshot.
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt"
CASE_FOLDING = "CaseFolding%s.txt"

# Private Use Areas -- in planes 1, 15, 16
PUA_1 = range(0xE000, 0xF900)
PUA_15 = range(0xF0000, 0xFFFFE)
PUA_16 = range(0x100000, 0x10FFFE)

# we use these ranges of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0100

# older UCD versions for which delta tables are generated
old_versions = ["3.2.0"]

# General-category abbreviations; list position is the numeric code emitted
# into the C tables.  NOTE: "Cn" appears twice (index 0 and 17) — index 0
# doubles as the default for unassigned code points.
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

# Bidirectional-class abbreviations; position is the numeric code emitted.
BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

# East-Asian-width abbreviations; position is the numeric code emitted.
EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

# Line_Break classes that force a mandatory break (from LineBreak.txt).
MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# Bit flags packed into the type-record "flags" field.
# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NUMERIC_MASK = 0x800
CASE_IGNORABLE_MASK = 0x1000
CASED_MASK = 0x2000
EXTENDED_CASE_MASK = 0x4000

# CJK unified-ideograph first/last code points (hex strings).
# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
    ('3400', '4DB5'),
    ('4E00', '9FCB'),
    ('20000', '2A6D6'),
    ('2A700', '2B734'),
    ('2B740', '2B81D')
]
103
def maketables(trace=0):
    """Drive the full regeneration: load the current and old UCD versions,
    merge the deltas, then write the name, data, and type headers."""

    def report(db):
        # Count the code points that actually carry a record.
        print(sum(1 for entry in db.table if entry), "characters")

    print("--- Reading", UNICODE_DATA % "", "...")

    version = ""
    unicode = UnicodeData(UNIDATA_VERSION)
    report(unicode)

    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-" + version), "...")
        old_unicode = UnicodeData(version, cjk_check=False)
        report(old_unicode)
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000122
123# --------------------------------------------------------------------
124# unicode character properties
125
def makeunicodedata(unicode, trace):
    """Write Modules/unicodedata_db.h from the loaded UnicodeData.

    Emits: the deduplicated database records, the decomposition tables,
    the NFC composition-pair tables, and one change/normalization delta
    table per entry in unicode.changed (old UCD versions).

    unicode -- a UnicodeData instance (table indexed by code point)
    trace   -- passed through to splitbins()/Array.dump() diagnostics
    """

    # A record is (category, combining, bidirectional, mirrored,
    # east_asian_width, quickcheck); slot 0 is the all-zero dummy used
    # for unassigned code points.
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties (field numbers follow the
            # UnicodeData.txt column layout, UAX #44)
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            normalizationquickcheck = record[17]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables; identical records share
            # one slot via the cache
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]        # flat int stream: [header, codepoint, ...] runs
    decomp_prefix = [""]     # compatibility tags ("<compat>", ...); "" = canonical
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                # unicodeobject.c's nfd_nfkd buffer bounds the expansion
                if len(decomp) > 19:
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix (compatibility tag, if any)
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content: one header word (prefix index in the low byte,
                # length in the next byte) followed by the code points
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs: canonical two-char decompositions,
                # not excluded, whose first char has combining class 0
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    # NOTE(review): `p` (the header word) is unpacked but unused
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                # NOTE(review): decomp_data is a flat list of ints, so
                # .index(decomp) (a list) can never match and this always
                # falls through to ValueError — entries are appended anew
                # and never shared.  Harmless for correctness, but the
                # dedup appears ineffective; confirm before relying on it.
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    # Renumber the NFC first/last participants consecutively and collect
    # the contiguous code-point ranges they occupy.
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    # Dense (first x last) matrix mapping a pair to its composed char.
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
    print("/* a list of unique database records */", file=fp)
    print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    print("/* Reindexing of NFC first characters. */", file=fp)
    print("#define TOTAL_FIRST",total_first, file=fp)
    print("#define TOTAL_LAST",total_last, file=fp)
    print("struct reindex{int start;short count,index;};", file=fp)
    print("static struct reindex nfc_first[] = {", file=fp)
    for start,end in comp_first_ranges:
        print("    { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)
    print("    {0,0,0}", file=fp)
    print("};\n", file=fp)
    print("static struct reindex nfc_last[] = {", file=fp)
    for start,end in comp_last_ranges:
        print("    { %d, %d, %d}," % (start,end-start,comp_last[start]), file=fp)
    print("    {0,0,0}", file=fp)
    print("};\n", file=fp)

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print("/* string literals */", file=fp)
    print("const char *_PyUnicode_CategoryNames[] = {", file=fp)
    for name in CATEGORY_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_BidirectionalNames[] = {", file=fp)
    for name in BIDIRECTIONAL_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_EastAsianWidthNames[] = {", file=fp)
    for name in EASTASIANWIDTH_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("static const char *decomp_prefix[] = {", file=fp)
    for name in decomp_prefix:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    # split record index table into a two-level lookup
    index1, index2, shift = splitbins(index, trace)

    print("/* index tables for the database records */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print("/* decomposition data */", file=fp)
    Array("decomp_data", decomp_data).dump(fp, trace)

    print("/* index tables for the decomposition data */", file=fp)
    print("#define DECOMP_SHIFT", shift, file=fp)
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print("/* NFC pairs */", file=fp)
    print("#define COMP_SHIFT", shift, file=fp)
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions: per-version change records
    # plus C accessors get_change_X_Y_Z() and normalization_X_Y_Z().
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print("static const change_record change_records_%s[] = {" % cversion, file=fp)
        for record in records:
            print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
        print("};", file=fp)
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tint index;", file=fp)
        print("\tif (n >= 0x110000) index = 0;", file=fp)
        print("\telse {", file=fp)
        print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
        print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1)), file=fp)
        print("\t}", file=fp)
        print("\treturn change_records_%s+index;" % cversion, file=fp)
        print("}\n", file=fp)
        print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tswitch(n) {", file=fp)
        for k, v in normalization:
            print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
        print("\tdefault: return 0;", file=fp)
        print("\t}\n}\n", file=fp)

    fp.close()
374
375# --------------------------------------------------------------------
376# unicode character type tables
377
def makeunicodetype(unicode, trace):
    """Write Objects/unicodetype_db.h from the loaded UnicodeData.

    Emits: the deduplicated (upper, lower, title, decimal, digit, flags)
    type records, the extended-case array for multi-character mappings,
    the two-level index, and generated C bodies for
    _PyUnicode_ToNumeric(), _PyUnicode_IsWhitespace() and
    _PyUnicode_IsLinebreak().

    unicode -- a UnicodeData instance (table indexed by code point)
    trace   -- passed through to splitbins()/Array.dump() diagnostics
    """

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types; slot 0 is the all-zero dummy record
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}        # numeric-value string -> [code points]
    spaces = []         # code points flagged SPACE_MASK
    linebreaks = []     # code points flagged LINEBREAK_MASK
    extra_casing = []   # flat Py_UCS4 array for multi-char case mappings

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties (UnicodeData.txt columns;
            # record[16] holds derived property names, record[17] quickcheck)
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            # NOTE(review): `delta` is never read in this function —
            # leftover from the removed delta-encoding scheme.
            delta = True
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if "Lowercase" in properties:
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if "Uppercase" in properties:
                flags |= UPPER_MASK
            # printable: everything except other (C*) and separator (Z*)
            # categories, with U+0020 SPACE as the sole exception
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            if "Cased" in properties:
                flags |= CASED_MASK
            if "Case_Ignorable" in properties:
                flags |= CASE_IGNORABLE_MASK
            sc = unicode.special_casing.get(char)
            cf = unicode.case_folding.get(char, [char])
            # simple one-to-one mappings from UnicodeData.txt columns 12-14;
            # empty field means the character maps to itself
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                # title defaults to the uppercase mapping
                title = upper
            # a case-folding different from the lowercase forces the
            # extended-case path even without a SpecialCasing entry
            if sc is None and cf != [lower]:
                sc = ([lower], [title], [upper])
            if sc is None:
                if upper == lower == title:
                    upper = lower = title = 0
            else:
                # This happens either when some character maps to more than one
                # character in uppercase, lowercase, or titlecase or the
                # casefolded version of the character is different from the
                # lowercase. The extra characters are stored in a different
                # array.
                flags |= EXTENDED_CASE_MASK
                # packed reference: offset in the low bits, length << 24,
                # and (for lower) the folding length << 20 when it differs
                lower = len(extra_casing) | (len(sc[0]) << 24)
                extra_casing.extend(sc[0])
                if cf != sc[0]:
                    lower |= len(cf) << 20
                    extra_casing.extend(cf)
                upper = len(extra_casing) | (len(sc[2]) << 24)
                extra_casing.extend(sc[2])
                # Title is probably equal to upper.
                if sc[1] == sc[2]:
                    title = upper
                else:
                    title = len(extra_casing) | (len(sc[1]) << 24)
                    extra_casing.extend(sc[1])
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables; identical records share
            # one slot via the cache
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")
    print(len(extra_casing), "extended case array")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("/* a list of unique character type descriptors */", file=fp)
    print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    print("/* extended case mappings */", file=fp)
    print(file=fp)
    print("const Py_UCS4 _PyUnicode_ExtendedCase[] = {", file=fp)
    for c in extra_casing:
        print("    %d," % c, file=fp)
    print("};", file=fp)
    print(file=fp)

    # split decomposition index table into a two-level lookup
    index1, index2, shift = splitbins(index, trace)

    print("/* type indexes */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # Generate code for _PyUnicode_ToNumeric()
    numeric_items = sorted(numeric.items())
    print('/* Returns the numeric value as double for Unicode characters', file=fp)
    print(' * having this property, -1.0 otherwise.', file=fp)
    print(' */', file=fp)
    print('double _PyUnicode_ToNumeric(Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for value, codepoints in numeric_items:
        # Turn text into float literals (fractions like "1/4" stay a
        # C division of two float literals)
        parts = value.split('/')
        parts = [repr(float(part)) for part in parts]
        value = '/'.join(parts)

        codepoints.sort()
        for codepoint in codepoints:
            print('    case 0x%04X:' % (codepoint,), file=fp)
        print('        return (double) %s;' % (value,), file=fp)
    print('    }', file=fp)
    print('    return -1.0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsWhitespace()
    print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)
    print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsWhitespace(register const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)

    for codepoint in sorted(spaces):
        print('    case 0x%04X:' % (codepoint,), file=fp)
    print('        return 1;', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsLinebreak()
    print("/* Returns 1 for Unicode characters having the line break", file=fp)
    print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp)
    print(" * type 'B', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsLinebreak(register const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for codepoint in sorted(linebreaks):
        print('    case 0x%04X:' % (codepoint,), file=fp)
    print('        return 1;', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    fp.close()
580
581# --------------------------------------------------------------------
582# unicode name database
583
def makeunicodename(unicode, trace):
    """Generate Modules/unicodename_db.h: the code point <-> name database.

    unicode is a UnicodeData instance; trace enables size statistics on
    stderr.  The output contains a compressed "phrasebook" (code -> name),
    a static hash table (name -> code), and the alias / named-sequence
    tables used by the unicodedata module.
    """

    FILE = "Modules/unicodename_db.h"

    print("--- Preparing", FILE, "...")

    # collect names, NUL-terminated; entries starting with "<" are range
    # markers (e.g. "<CJK Ideograph, First>"), not real names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print(len(list(n for n in names if n is not None)), "distinct names")

    # collect unique words from names (note that we distinguish between
    # words inside a sentence, and words ending a sentence.  the
    # latter includes the trailing null byte.)

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                # each dict value is a list whose *length* is the word's
                # frequency; the first element is the insertion index
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print(n, "words in text;", b, "bytes")

    wordlist = list(words.items())

    # sort on falling frequency, then by name
    def word_key(a):
        aword, alist = a
        return -len(alist), aword
    wordlist.sort(key=word_key)

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print(escapes, "escapes")

    # word indexes below 'short' fit in one phrasebook byte; the rest are
    # encoded as an escape byte (>= short) plus a second byte
    short = 256 - escapes

    assert short > 0

    print(short, "short indexes in lexicon")

    # statistics: total occurrences of the 'short' most frequent words
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print(n, "short indexes in phrasebook")

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string; words[w] maps a word to its 1-based index
    # into lexicon_offset (index 0 is a sentinel)
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = list(map(ord, lexicon))

    # generate phrasebook from names and lexicon: each name becomes a
    # sequence of word indexes (one or two bytes per word, see 'short')
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    # every phrasebook entry must fit in an unsigned char
    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set. if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("#define NAME_MAXLEN", 256, file=fp)
    print(file=fp)
    print("/* lexicon */", file=fp)
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split decomposition index table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print("/* code->name phrasebook */", file=fp)
    print("#define phrasebook_shift", shift, file=fp)
    print("#define phrasebook_short", short, file=fp)

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print("/* name->code dictionary */", file=fp)
    codehash.dump(fp, trace)

    print(file=fp)
    print('static const unsigned int aliases_start = %#x;' %
          NAME_ALIASES_START, file=fp)
    print('static const unsigned int aliases_end = %#x;' %
          (NAME_ALIASES_START + len(unicode.aliases)), file=fp)

    print('static const unsigned int name_aliases[] = {', file=fp)
    for name, codepoint in unicode.aliases:
        print('    0x%04X,' % codepoint, file=fp)
    print('};', file=fp)

    # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
    # so we are using Py_UCS2 seq[4]. This needs to be updated if longer
    # sequences or sequences with non-BMP chars are added.
    # unicodedata_lookup should be adapted too.
    print(dedent("""
        typedef struct NamedSequence {
            int seqlen;
            Py_UCS2 seq[4];
        } named_sequence;
        """), file=fp)

    print('static const unsigned int named_sequences_start = %#x;' %
          NAMED_SEQUENCES_START, file=fp)
    print('static const unsigned int named_sequences_end = %#x;' %
          (NAMED_SEQUENCES_START + len(unicode.named_sequences)), file=fp)

    print('static const named_sequence named_sequences[] = {', file=fp)
    for name, sequence in unicode.named_sequences:
        seq_str = ', '.join('0x%04X' % cp for cp in sequence)
        print('    {%d, {%s}},' % (len(sequence), seq_str), file=fp)
    print('};', file=fp)

    fp.close()
775
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000776
def merge_old_version(version, new, old):
    """Record the differences between an old UCD version and the new one.

    Appends a (version, changes, normalization_changes) tuple to
    new.changed, where changes is a per-codepoint list of
    (bidir, category, decimal, mirrored, numeric) deltas.  The k indexes
    below are field positions in the UnicodeData record (see the
    "Record structure" comment on UnicodeData).
    """
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 1 and i in PUA_15:
                        # the name is not set in the old.table, but in the
                        # new.table we are using it for aliases and named seq
                        assert value == ''
                    elif k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value is better not 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 16:
                        # derived property changes; not yet
                        pass
                    elif k == 17:
                        # normalization quickchecks are not performed
                        # for older versions
                        pass
                    else:
                        # any other field difference is unexpected; fail loudly
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, list(zip(bidir_changes, category_changes,
                                          decimal_changes, mirrored_changes,
                                          numeric_changes)),
                       normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000867
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000868def open_data(template, version):
869 local = template % ('-'+version,)
870 if not os.path.exists(local):
871 import urllib.request
872 if version == '3.2.0':
873 # irregular url structure
874 url = 'http://www.unicode.org/Public/3.2-Update/' + local
875 else:
876 url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
877 urllib.request.urlretrieve(url, filename=local)
878 if local.endswith('.txt'):
879 return open(local, encoding='utf-8')
880 else:
881 # Unihan.zip
882 return open(local, 'rb')
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000883
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000884# --------------------------------------------------------------------
885# the following support code is taken from the unidb utilities
886# Copyright (c) 1999-2000 by Secret Labs AB
887
888# load a unicode-data file from disk
889
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000890class UnicodeData:
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000891 # Record structure:
892 # [ID, name, category, combining, bidi, decomp, (6)
893 # decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
894 # ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
895 # derived-props] (17)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000896
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000897 def __init__(self, version,
898 linebreakprops=False,
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000899 expand=1,
900 cjk_check=True):
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000901 self.changed = []
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000902 table = [None] * 0x110000
Ezio Melotti2a1e9262011-09-30 08:46:25 +0300903 with open_data(UNICODE_DATA, version) as file:
904 while 1:
905 s = file.readline()
906 if not s:
907 break
908 s = s.strip().split(";")
909 char = int(s[0], 16)
910 table[char] = s
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000911
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000912 cjk_ranges_found = []
913
Martin v. Löwis97225da2002-11-24 23:05:09 +0000914 # expand first-last ranges
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000915 if expand:
916 field = None
Martin v. Löwis97225da2002-11-24 23:05:09 +0000917 for i in range(0, 0x110000):
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000918 s = table[i]
919 if s:
920 if s[1][-6:] == "First>":
921 s[1] = ""
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000922 field = s
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000923 elif s[1][-5:] == "Last>":
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000924 if s[1].startswith("<CJK Ideograph"):
925 cjk_ranges_found.append((field[0],
926 s[0]))
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000927 s[1] = ""
928 field = None
929 elif field:
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000930 f2 = field[:]
931 f2[0] = "%X" % i
932 table[i] = f2
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000933 if cjk_check and cjk_ranges != cjk_ranges_found:
934 raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000935
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000936 # public attributes
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000937 self.filename = UNICODE_DATA % ''
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000938 self.table = table
Georg Brandlbf82e372008-05-16 17:02:34 +0000939 self.chars = list(range(0x110000)) # unicode 3.2
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000940
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300941 # check for name aliases and named sequences, see #12753
942 # aliases and named sequences are not in 3.2.0
943 if version != '3.2.0':
944 self.aliases = []
945 # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
946 # in order to take advantage of the compression and lookup
947 # algorithms used for the other characters
948 pua_index = NAME_ALIASES_START
949 with open_data(NAME_ALIASES, version) as file:
950 for s in file:
951 s = s.strip()
952 if not s or s.startswith('#'):
953 continue
954 char, name = s.split(';')
955 char = int(char, 16)
956 self.aliases.append((name, char))
957 # also store the name in the PUA 1
958 self.table[pua_index][1] = name
959 pua_index += 1
960 assert pua_index - NAME_ALIASES_START == len(self.aliases)
961
962 self.named_sequences = []
963 # store named seqences in the PUA 1, in range U+F0100..,
964 # in order to take advantage of the compression and lookup
965 # algorithms used for the other characters.
966
967 pua_index = NAMED_SEQUENCES_START
968 with open_data(NAMED_SEQUENCES, version) as file:
969 for s in file:
970 s = s.strip()
971 if not s or s.startswith('#'):
972 continue
973 name, chars = s.split(';')
974 chars = tuple(int(char, 16) for char in chars.split())
975 # check that the structure defined in makeunicodename is OK
976 assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
977 assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
978 "the NamedSequence struct and in unicodedata_lookup")
979 self.named_sequences.append((name, chars))
980 # also store these in the PUA 1
981 self.table[pua_index][1] = name
982 pua_index += 1
983 assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)
984
Martin v. Löwis677bde22002-11-23 22:08:15 +0000985 self.exclusions = {}
Ezio Melotti2a1e9262011-09-30 08:46:25 +0300986 with open_data(COMPOSITION_EXCLUSIONS, version) as file:
987 for s in file:
988 s = s.strip()
989 if not s:
990 continue
991 if s[0] == '#':
992 continue
993 char = int(s.split()[0],16)
994 self.exclusions[char] = 1
Martin v. Löwis677bde22002-11-23 22:08:15 +0000995
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +0000996 widths = [None] * 0x110000
Ezio Melotti2a1e9262011-09-30 08:46:25 +0300997 with open_data(EASTASIAN_WIDTH, version) as file:
998 for s in file:
999 s = s.strip()
1000 if not s:
1001 continue
1002 if s[0] == '#':
1003 continue
1004 s = s.split()[0].split(';')
1005 if '..' in s[0]:
1006 first, last = [int(c, 16) for c in s[0].split('..')]
1007 chars = list(range(first, last+1))
1008 else:
1009 chars = [int(s[0], 16)]
1010 for char in chars:
1011 widths[char] = s[1]
1012
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00001013 for i in range(0, 0x110000):
1014 if table[i] is not None:
1015 table[i].append(widths[i])
1016
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001017 for i in range(0, 0x110000):
1018 if table[i] is not None:
1019 table[i].append(set())
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001020
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001021 with open_data(DERIVED_CORE_PROPERTIES, version) as file:
1022 for s in file:
1023 s = s.split('#', 1)[0].strip()
1024 if not s:
1025 continue
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001026
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001027 r, p = s.split(";")
1028 r = r.strip()
1029 p = p.strip()
1030 if ".." in r:
1031 first, last = [int(c, 16) for c in r.split('..')]
1032 chars = list(range(first, last+1))
1033 else:
1034 chars = [int(r, 16)]
1035 for char in chars:
1036 if table[char]:
1037 # Some properties (e.g. Default_Ignorable_Code_Point)
1038 # apply to unassigned code points; ignore them
1039 table[char][-1].add(p)
1040
1041 with open_data(LINE_BREAK, version) as file:
1042 for s in file:
1043 s = s.partition('#')[0]
1044 s = [i.strip() for i in s.split(';')]
1045 if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
1046 continue
1047 if '..' not in s[0]:
1048 first = last = int(s[0], 16)
1049 else:
1050 first, last = [int(c, 16) for c in s[0].split('..')]
1051 for char in range(first, last+1):
1052 table[char][-1].add('Line_Break')
Florent Xicluna806d8cf2010-03-30 19:34:18 +00001053
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001054 # We only want the quickcheck properties
1055 # Format: NF?_QC; Y(es)/N(o)/M(aybe)
1056 # Yes is the default, hence only N and M occur
1057 # In 3.2.0, the format was different (NF?_NO)
1058 # The parsing will incorrectly determine these as
1059 # "yes", however, unicodedata.c will not perform quickchecks
1060 # for older versions, and no delta records will be created.
1061 quickchecks = [0] * 0x110000
1062 qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001063 with open_data(DERIVEDNORMALIZATION_PROPS, version) as file:
1064 for s in file:
1065 if '#' in s:
1066 s = s[:s.index('#')]
1067 s = [i.strip() for i in s.split(';')]
1068 if len(s) < 2 or s[1] not in qc_order:
1069 continue
1070 quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
1071 quickcheck_shift = qc_order.index(s[1])*2
1072 quickcheck <<= quickcheck_shift
1073 if '..' not in s[0]:
1074 first = last = int(s[0], 16)
1075 else:
1076 first, last = [int(c, 16) for c in s[0].split('..')]
1077 for char in range(first, last+1):
1078 assert not (quickchecks[char]>>quickcheck_shift)&3
1079 quickchecks[char] |= quickcheck
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001080 for i in range(0, 0x110000):
1081 if table[i] is not None:
1082 table[i].append(quickchecks[i])
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +00001083
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001084 with open_data(UNIHAN, version) as file:
1085 zip = zipfile.ZipFile(file)
1086 if version == '3.2.0':
1087 data = zip.open('Unihan-3.2.0.txt').read()
1088 else:
1089 data = zip.open('Unihan_NumericValues.txt').read()
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001090 for line in data.decode("utf-8").splitlines():
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +00001091 if not line.startswith('U+'):
1092 continue
1093 code, tag, value = line.split(None, 3)[:3]
1094 if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
1095 'kOtherNumeric'):
1096 continue
1097 value = value.strip().replace(',', '')
1098 i = int(code[2:], 16)
1099 # Patch the numeric field
1100 if table[i] is not None:
1101 table[i][8] = value
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05001102 sc = self.special_casing = {}
1103 with open_data(SPECIAL_CASING, version) as file:
1104 for s in file:
1105 s = s[:-1].split('#', 1)[0]
1106 if not s:
1107 continue
1108 data = s.split("; ")
1109 if data[4]:
1110 # We ignore all conditionals (since they depend on
1111 # languages) except for one, which is hardcoded. See
1112 # handle_capital_sigma in unicodeobject.c.
1113 continue
1114 c = int(data[0], 16)
1115 lower = [int(char, 16) for char in data[1].split()]
1116 title = [int(char, 16) for char in data[2].split()]
1117 upper = [int(char, 16) for char in data[3].split()]
1118 sc[c] = (lower, title, upper)
Benjamin Petersond5890c82012-01-14 13:23:30 -05001119 cf = self.case_folding = {}
1120 if version != '3.2.0':
1121 with open_data(CASE_FOLDING, version) as file:
1122 for s in file:
1123 s = s[:-1].split('#', 1)[0]
1124 if not s:
1125 continue
1126 data = s.split("; ")
1127 if data[1] in "CF":
1128 c = int(data[0], 16)
1129 cf[c] = [int(char, 16) for char in data[2].split()]
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +00001130
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001131 def uselatin1(self):
1132 # restrict character range to ISO Latin 1
Georg Brandlbf82e372008-05-16 17:02:34 +00001133 self.chars = list(range(256))
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001134
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001135# hash table tools
1136
1137# this is a straight-forward reimplementation of Python's built-in
1138# dictionary type, using a static data structure, and a custom string
1139# hash algorithm.
1140
def myhash(s, magic):
    """Case-insensitive 24-bit string hash used by the static Hash tables.

    Accumulates h = h*magic + ord(c) over the uppercased string, folding
    any overflow above bit 23 back into the low bits so the result always
    fits in 24 bits.
    """
    value = 0
    for code in (ord(ch) for ch in s.upper()):
        value = value * magic + code
        overflow = value & 0xff000000
        if overflow:
            value = (value ^ ((overflow >> 24) & 0xff)) & 0x00ffffff
    return value
1149
# (table size, polynomial) candidates for Hash, ordered by size.  Sizes are
# powers of two; Hash.__init__ picks the first size larger than the data
# set, and the associated polynomial is XORed into the probe increment to
# resolve collisions.
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
1156
class Hash:
    """Static open-addressing hash table, emitted as a C array.

    The probe sequence implemented here must exactly match the lookup code
    in Modules/unicodedata.c, since the table is built at generation time
    and only probed at runtime.
    """

    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size: first power of two larger than the data set
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynomials")

        print(size, "slots in hash table")

        table = [None] * size

        mask = size-1

        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            # collision: derive a probe increment from the hash
            incr = (h ^ (h >> 3)) & mask;
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                # vary the increment using the table's polynomial
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print(n, "collisions")
        self.collisions = n

        # empty slots are stored as 0 in the C array
        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array plus the #defines the C lookup
        # code needs to reproduce the probe sequence
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
1220
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001221# stuff to deal with arrays of unsigned integers
1222
1223class Array:
1224
1225 def __init__(self, name, data):
1226 self.name = name
1227 self.data = data
1228
Fredrik Lundh7b7dd102001-01-21 22:41:08 +00001229 def dump(self, file, trace=0):
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001230 # write data to file, as a C array
1231 size = getsize(self.data)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +00001232 if trace:
Collin Winter6afaeb72007-08-03 17:06:41 +00001233 print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001234 file.write("static ")
1235 if size == 1:
1236 file.write("unsigned char")
1237 elif size == 2:
1238 file.write("unsigned short")
1239 else:
1240 file.write("unsigned int")
1241 file.write(" " + self.name + "[] = {\n")
1242 if self.data:
1243 s = " "
1244 for item in self.data:
1245 i = str(item) + ", "
1246 if len(s) + len(i) > 78:
1247 file.write(s + "\n")
1248 s = " " + i
1249 else:
1250 s = s + i
Walter Dörwaldaaab30e2002-09-11 20:36:02 +00001251 if s.strip():
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001252 file.write(s + "\n")
1253 file.write("};\n\n")
1254
def getsize(data):
    """Return the smallest C integer width (1, 2 or 4 bytes) that can
    represent every value in *data*."""
    largest = max(data)
    if largest < 256:
        return 1
    if largest < 65536:
        return 2
    return 4
1264
Tim Peters21013482000-09-25 07:13:41 +00001265def splitbins(t, trace=0):
1266 """t, trace=0 -> (t1, t2, shift). Split a table to save space.
1267
1268 t is a sequence of ints. This function can be useful to save space if
1269 many of the ints are the same. t1 and t2 are lists of ints, and shift
1270 is an int, chosen to minimize the combined size of t1 and t2 (in C
1271 code), and where for each i in range(len(t)),
1272 t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1273 where mask is a bitmask isolating the last "shift" bits.
1274
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001275 If optional arg trace is non-zero (default zero), progress info
1276 is printed to sys.stderr. The higher the value, the more info
1277 you'll get.
Tim Peters21013482000-09-25 07:13:41 +00001278 """
1279
Tim Peters21013482000-09-25 07:13:41 +00001280 if trace:
1281 def dump(t1, t2, shift, bytes):
Collin Winter6afaeb72007-08-03 17:06:41 +00001282 print("%d+%d bins at shift %d; %d bytes" % (
1283 len(t1), len(t2), shift, bytes), file=sys.stderr)
1284 print("Size of original table:", len(t)*getsize(t), \
1285 "bytes", file=sys.stderr)
Tim Peters21013482000-09-25 07:13:41 +00001286 n = len(t)-1 # last valid index
1287 maxshift = 0 # the most we can shift n and still have something left
1288 if n > 0:
1289 while n >> 1:
1290 n >>= 1
1291 maxshift += 1
1292 del n
Christian Heimesa37d4c62007-12-04 23:02:19 +00001293 bytes = sys.maxsize # smallest total size so far
Tim Peters21013482000-09-25 07:13:41 +00001294 t = tuple(t) # so slices can be dict keys
1295 for shift in range(maxshift + 1):
1296 t1 = []
1297 t2 = []
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001298 size = 2**shift
1299 bincache = {}
Tim Peters21013482000-09-25 07:13:41 +00001300 for i in range(0, len(t), size):
1301 bin = t[i:i+size]
1302 index = bincache.get(bin)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001303 if index is None:
Tim Peters21013482000-09-25 07:13:41 +00001304 index = len(t2)
1305 bincache[bin] = index
1306 t2.extend(bin)
1307 t1.append(index >> shift)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001308 # determine memory size
Tim Peters21013482000-09-25 07:13:41 +00001309 b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001310 if trace > 1:
Tim Peters21013482000-09-25 07:13:41 +00001311 dump(t1, t2, shift, b)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001312 if b < bytes:
Tim Peters21013482000-09-25 07:13:41 +00001313 best = t1, t2, shift
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001314 bytes = b
Tim Peters21013482000-09-25 07:13:41 +00001315 t1, t2, shift = best
1316 if trace:
Collin Winter6afaeb72007-08-03 17:06:41 +00001317 print("Best:", end=' ', file=sys.stderr)
Tim Peters21013482000-09-25 07:13:41 +00001318 dump(t1, t2, shift, bytes)
1319 if __debug__:
1320 # exhaustively verify that the decomposition is correct
1321 mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
Guido van Rossum805365e2007-05-07 22:24:25 +00001322 for i in range(len(t)):
Tim Peters21013482000-09-25 07:13:41 +00001323 assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1324 return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001325
if __name__ == "__main__":
    # Regenerate all database headers.  NOTE(review): the argument appears
    # to be a trace flag (cf. the trace parameters above) -- confirm against
    # maketables, which is defined earlier in this file.
    maketables(1)