blob: 17edc3cf45b269020fb95587e82891d3dd2159f2 [file] [log] [blame]
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001#
Fredrik Lundhe9133f72000-09-25 17:59:57 +00002# (re)generate unicode property and type databases
3#
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00004# this script converts a unicode 3.2 database file to
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00005# Modules/unicodedata_db.h, Modules/unicodename_db.h,
6# and Objects/unicodetype_db.h
Fredrik Lundhcfcea492000-09-25 08:07:06 +00007#
8# history:
9# 2000-09-24 fl created (based on bits and pieces from unidb)
10# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
Fredrik Lundhe9133f72000-09-25 17:59:57 +000011# 2000-09-25 fl added character type table
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000012# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000013# 2000-11-03 fl expand first/last ranges
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000014# 2001-01-19 fl added character name tables (2.1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000015# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold
Martin v. Löwis677bde22002-11-23 22:08:15 +000016# 2002-09-11 wd use string methods
17# 2002-10-18 mvl update to Unicode 3.2
18# 2002-10-22 mvl generate NFC tables
Martin v. Löwis97225da2002-11-24 23:05:09 +000019# 2002-11-24 mvl expand all ranges, sort names version-independently
Martin v. Löwisb5c980b2002-11-25 09:13:37 +000020# 2002-11-25 mvl add UNIDATA_VERSION
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +000021# 2004-05-29 perky add east asian width information
Martin v. Löwis43179c82006-03-11 12:43:44 +000022# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
Georg Brandld52429f2008-07-04 15:55:02 +000023# 2008-06-11 gb add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
Ezio Melotti931b8aa2011-10-21 21:57:36 +030024# 2011-10-21 ezio add support for name aliases and named sequences
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050025# 2012-01 benjamin add full case mappings
Fredrik Lundhcfcea492000-09-25 08:07:06 +000026#
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000027# written by Fredrik Lundh (fredrik@pythonware.com)
Fredrik Lundhf367cac2000-09-24 23:18:31 +000028#
29
Ezio Melotti931b8aa2011-10-21 21:57:36 +030030import os
31import sys
32import zipfile
33
34from textwrap import dedent
35from operator import itemgetter
Fredrik Lundhf367cac2000-09-24 23:18:31 +000036
SCRIPT = sys.argv[0]
VERSION = "3.2"

# The Unicode Database
# Version of the Unicode Character Database the generated tables are built from.
UNIDATA_VERSION = "6.0.0"
# Input file name templates; "%s" is filled with "" for the current version
# or "-<version>" for an old version (see maketables/UnicodeData).
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt"
CASE_FOLDING = "CaseFolding%s.txt"

# Private Use Areas -- in planes 1, 15, 16
PUA_1 = range(0xE000, 0xF900)
PUA_15 = range(0xF0000, 0xFFFFE)
PUA_16 = range(0x100000, 0x10FFFE)

# we use these ranges of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0100
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000062
# Older UCD versions for which delta (change) tables are generated.
old_versions = ["3.2.0"]

# General-category abbreviations; index into this list is the value stored
# in the generated database records.
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

# Bidirectional-class abbreviations; index 0 is the empty/unassigned class.
BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

# East Asian Width abbreviations (Fullwidth, Halfwidth, Wide, Narrow,
# Ambiguous, Neutral).
EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

# Line_Break classes that force a mandatory break.
MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NUMERIC_MASK = 0x800
CASE_IGNORABLE_MASK = 0x1000
CASED_MASK = 0x2000
EXTENDED_CASE_MASK = 0x4000

# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
    ('3400', '4DB5'),
    ('4E00', '9FCB'),
    ('20000', '2A6D6'),
    ('2A700', '2B734'),
    ('2B740', '2B81D')
]
103
def maketables(trace=0):
    """Top-level driver.

    Parses the current Unicode database, folds in the per-character deltas
    for every entry in ``old_versions``, then writes the three generated C
    headers (name, data and type tables).  *trace* is forwarded to the
    table writers for splitbins diagnostics.
    """
    print("--- Reading", UNICODE_DATA % "", "...")

    version = ""
    unicode = UnicodeData(UNIDATA_VERSION)
    print(len([record for record in unicode.table if record]), "characters")

    # Merge in the change records for each supported old UCD version.
    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-"+version), "...")
        old_unicode = UnicodeData(version, cjk_check=False)
        print(len([record for record in old_unicode.table if record]), "characters")
        merge_old_version(version, unicode, old_unicode)

    # Emit the three generated headers.
    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000122
123# --------------------------------------------------------------------
124# unicode character properties
125
def makeunicodedata(unicode, trace):
    """Write Modules/unicodedata_db.h.

    Emits: the deduplicated database-record table, the NFC first/last
    reindex tables and composition pairs, the decomposition data and its
    split index tables, the string-literal name arrays, and per-version
    change_record / normalization delta functions for each entry in
    ``unicode.changed``.  *trace* is passed through to splitbins/Array.
    """

    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties
    # Each unique (category, combining, bidi, mirrored, east-asian width,
    # NFC quick-check) tuple is stored once; index[] maps char -> record.

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            normalizationquickcheck = record[17]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data
    # decomp_data is a flat int array: a header word (prefix index in the
    # low byte, decomposition length in the high bits) followed by the
    # decomposed code points.  decomp_index maps char -> offset (0 = none).

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                # the C decoder (nfd_nfkd) has a fixed-size buffer
                if len(decomp) > 19:
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix: a "<compat>"-style tag, if any
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content: header word followed by the code points
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs: canonical two-character decompositions
                # not excluded and whose first char is a starter (ccc == 0)
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                # NOTE(review): decomp_data holds ints, so .index(decomp)
                # (a list) can never match and this always appends; dedup
                # looks intended but never fires -- verify before changing.
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    # Renumber the chars participating in NFC pairs densely and record the
    # contiguous ranges of first/last characters for the C reindex tables.
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    # Dense 2-D composition table: (first, last) -> composed char.
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
    print("/* a list of unique database records */", file=fp)
    print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    print("/* Reindexing of NFC first characters. */", file=fp)
    print("#define TOTAL_FIRST",total_first, file=fp)
    print("#define TOTAL_LAST",total_last, file=fp)
    print("struct reindex{int start;short count,index;};", file=fp)
    print("static struct reindex nfc_first[] = {", file=fp)
    for start,end in comp_first_ranges:
        print("  { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)
    print("  {0,0,0}", file=fp)
    print("};\n", file=fp)
    print("static struct reindex nfc_last[] = {", file=fp)
    for start,end in comp_last_ranges:
        print("  { %d, %d, %d}," % (start,end-start,comp_last[start]), file=fp)
    print("  {0,0,0}", file=fp)
    print("};\n", file=fp)

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print("/* string literals */", file=fp)
    print("const char *_PyUnicode_CategoryNames[] = {", file=fp)
    for name in CATEGORY_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_BidirectionalNames[] = {", file=fp)
    for name in BIDIRECTIONAL_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_EastAsianWidthNames[] = {", file=fp)
    for name in EASTASIANWIDTH_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("static const char *decomp_prefix[] = {", file=fp)
    for name in decomp_prefix:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print("/* index tables for the database records */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print("/* decomposition data */", file=fp)
    Array("decomp_data", decomp_data).dump(fp, trace)

    print("/* index tables for the decomposition data */", file=fp)
    print("#define DECOMP_SHIFT", shift, file=fp)
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print("/* NFC pairs */", file=fp)
    print("#define COMP_SHIFT", shift, file=fp)
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions: deduplicate the per-char
    # change records, then emit a lookup function and a normalization
    # override function for each old version.
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print("static const change_record change_records_%s[] = {" % cversion, file=fp)
        for record in records:
            print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
        print("};", file=fp)
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tint index;", file=fp)
        print("\tif (n >= 0x110000) index = 0;", file=fp)
        print("\telse {", file=fp)
        print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
        print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1)), file=fp)
        print("\t}", file=fp)
        print("\treturn change_records_%s+index;" % cversion, file=fp)
        print("}\n", file=fp)
        print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tswitch(n) {", file=fp)
        for k, v in normalization:
            print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
        print("\tdefault: return 0;", file=fp)
        print("\t}\n}\n", file=fp)

    fp.close()
374
375# --------------------------------------------------------------------
376# unicode character type tables
377
def makeunicodetype(unicode, trace):
    """Write Objects/unicodetype_db.h.

    Emits: the deduplicated _PyUnicode_TypeRecords table (case offsets,
    decimal/digit values, type flags), the _PyUnicode_ExtendedCase array
    for multi-character case mappings, split index tables, and the
    generated C functions _PyUnicode_ToNumeric, _PyUnicode_IsWhitespace
    and _PyUnicode_IsLinebreak.  *trace* is passed to splitbins/Array.
    """

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}            # numeric value string -> [code points]
    spaces = []             # code points with the space flag
    linebreaks = []         # code points with the linebreak flag
    extra_casing = []       # flat Py_UCS4 array for extended case mappings

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            # NOTE(review): 'delta' is assigned but never read below --
            # looks like leftover from an earlier delta-encoding scheme.
            delta = True
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if "Lowercase" in properties:
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if "Uppercase" in properties:
                flags |= UPPER_MASK
            # printable: everything except control/format/surrogate/
            # private-use/unassigned and separators -- except space itself
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            if "Cased" in properties:
                flags |= CASED_MASK
            if "Case_Ignorable" in properties:
                flags |= CASE_IGNORABLE_MASK
            sc = unicode.special_casing.get(char)
            cf = unicode.case_folding.get(char, [char])
            # simple one-to-one mappings from UnicodeData.txt fields
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                title = upper
            # a case folding differing from lowercase forces the
            # extended-case representation below
            if sc is None and cf != [lower]:
                sc = ([lower], [title], [upper])
            if sc is None:
                # simple case: store signed deltas from the char itself
                # (0, 0, 0 when the char does not case-map at all)
                if upper == lower == title:
                    upper = lower = title = 0
                else:
                    upper = upper - char
                    lower = lower - char
                    title = title - char
                    assert (abs(upper) <= 2147483647 and
                            abs(lower) <= 2147483647 and
                            abs(title) <= 2147483647)
            else:
                # This happens either when some character maps to more than one
                # character in uppercase, lowercase, or titlecase or the
                # casefolded version of the character is different from the
                # lowercase. The extra characters are stored in a different
                # array.
                flags |= EXTENDED_CASE_MASK
                # encode offset into extra_casing plus length in high bits
                lower = len(extra_casing) | (len(sc[0]) << 24)
                extra_casing.extend(sc[0])
                if cf != sc[0]:
                    lower |= len(cf) << 20
                    extra_casing.extend(cf)
                upper = len(extra_casing) | (len(sc[2]) << 24)
                extra_casing.extend(sc[2])
                # Title is probably equal to upper.
                if sc[1] == sc[2]:
                    title = upper
                else:
                    title = len(extra_casing) | (len(sc[1]) << 24)
                    extra_casing.extend(sc[1])
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")
    print(len(extra_casing), "extended case array")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("/* a list of unique character type descriptors */", file=fp)
    print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    print("/* extended case mappings */", file=fp)
    print(file=fp)
    print("const Py_UCS4 _PyUnicode_ExtendedCase[] = {", file=fp)
    for c in extra_casing:
        print("    %d," % c, file=fp)
    print("};", file=fp)
    print(file=fp)

    # split decomposition index table
    index1, index2, shift = splitbins(index, trace)

    print("/* type indexes */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # Generate code for _PyUnicode_ToNumeric()
    numeric_items = sorted(numeric.items())
    print('/* Returns the numeric value as double for Unicode characters', file=fp)
    print(' * having this property, -1.0 otherwise.', file=fp)
    print(' */', file=fp)
    print('double _PyUnicode_ToNumeric(Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for value, codepoints in numeric_items:
        # Turn text into float literals (fractions stay as a/b expressions)
        parts = value.split('/')
        parts = [repr(float(part)) for part in parts]
        value = '/'.join(parts)

        codepoints.sort()
        for codepoint in codepoints:
            print('    case 0x%04X:' % (codepoint,), file=fp)
        print('        return (double) %s;' % (value,), file=fp)
    print('    }', file=fp)
    print('    return -1.0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsWhitespace()
    print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)
    print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsWhitespace(register const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)

    for codepoint in sorted(spaces):
        print('    case 0x%04X:' % (codepoint,), file=fp)
    print('        return 1;', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsLinebreak()
    print("/* Returns 1 for Unicode characters having the line break", file=fp)
    print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp)
    print(" * type 'B', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsLinebreak(register const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for codepoint in sorted(linebreaks):
        print('    case 0x%04X:' % (codepoint,), file=fp)
    print('        return 1;', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    fp.close()
587
588# --------------------------------------------------------------------
589# unicode name database
590
def makeunicodename(unicode, trace):
    """Generate Modules/unicodename_db.h (code point <-> name tables).

    Names are compressed in two steps: each name is split into words
    stored as phrasebook references into a shared lexicon string, and
    a static hash table maps names back to code points.

    unicode is a UnicodeData instance; a non-zero trace prints size
    statistics for the generated arrays to stderr.
    """

    FILE = "Modules/unicodename_db.h"

    print("--- Preparing", FILE, "...")

    # collect names; range markers such as <...First> are skipped.
    # each kept name gets a trailing NUL so the decompressor can tell
    # where a name ends
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print(sum(n is not None for n in names), "distinct names")

    # collect unique words from names (note that we differ between
    # words inside a sentence, and words ending a sentence; the
    # latter includes the trailing null byte)

    words = {}   # word -> list with one entry per occurrence
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            parts = name.split()
            b = b + len(name)
            n = n + len(parts)
            for word in parts:
                occurrences = words.get(word)
                if occurrences:
                    occurrences.append(None)
                else:
                    words[word] = [len(words)]

    print(n, "words in text;", b, "bytes")

    wordlist = list(words.items())

    # sort on falling frequency, then by name
    def word_key(a):
        aword, alist = a
        return -len(alist), aword
    wordlist.sort(key=word_key)

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print(escapes, "escapes")

    short = 256 - escapes

    assert short > 0

    print(short, "short indexes in lexicon")

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print(n, "short indexes in phrasebook")

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}   # reused: now maps word -> 1-based lexicon_offset index

    # build a lexicon string
    offset = 0
    for w, _ in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = list(map(ord, lexicon))

    # generate phrasebook from names and lexicon: frequent words fit
    # in one byte, rarer words take an escape byte plus a second byte
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            phrasebook_offset[char] = len(phrasebook)
            for word in name.split():
                i = words[word]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    # every phrasebook entry must fit in an unsigned char
    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set. if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print("--- Writing", FILE, "...")

    # use a context manager so the output file is closed even if a
    # dump step raises
    with open(FILE, "w") as fp:
        print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
        print(file=fp)
        print("#define NAME_MAXLEN", 256, file=fp)
        print(file=fp)
        print("/* lexicon */", file=fp)
        Array("lexicon", lexicon).dump(fp, trace)
        Array("lexicon_offset", lexicon_offset).dump(fp, trace)

        # split the phrasebook offset table for compact two-level lookup
        offset1, offset2, shift = splitbins(phrasebook_offset, trace)

        print("/* code->name phrasebook */", file=fp)
        print("#define phrasebook_shift", shift, file=fp)
        print("#define phrasebook_short", short, file=fp)

        Array("phrasebook", phrasebook).dump(fp, trace)
        Array("phrasebook_offset1", offset1).dump(fp, trace)
        Array("phrasebook_offset2", offset2).dump(fp, trace)

        print("/* name->code dictionary */", file=fp)
        codehash.dump(fp, trace)

        print(file=fp)
        print('static const unsigned int aliases_start = %#x;' %
              NAME_ALIASES_START, file=fp)
        print('static const unsigned int aliases_end = %#x;' %
              (NAME_ALIASES_START + len(unicode.aliases)), file=fp)

        print('static const unsigned int name_aliases[] = {', file=fp)
        for name, codepoint in unicode.aliases:
            print(' 0x%04X,' % codepoint, file=fp)
        print('};', file=fp)

        # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
        # so we are using Py_UCS2 seq[4]. This needs to be updated if longer
        # sequences or sequences with non-BMP chars are added.
        # unicodedata_lookup should be adapted too.
        print(dedent("""
            typedef struct NamedSequence {
                int seqlen;
                Py_UCS2 seq[4];
            } named_sequence;
            """), file=fp)

        print('static const unsigned int named_sequences_start = %#x;' %
              NAMED_SEQUENCES_START, file=fp)
        print('static const unsigned int named_sequences_end = %#x;' %
              (NAMED_SEQUENCES_START + len(unicode.named_sequences)), file=fp)

        print('static const named_sequence named_sequences[] = {', file=fp)
        for name, sequence in unicode.named_sequences:
            seq_str = ', '.join('0x%04X' % cp for cp in sequence)
            print(' {%d, {%s}},' % (len(sequence), seq_str), file=fp)
        print('};', file=fp)
782
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000783
def merge_old_version(version, new, old):
    """Record how the old UCD snapshot differs from the new one.

    Compares old.table and new.table code point by code point and
    appends (version, changes, normalization_changes) to new.changed,
    where changes is a per-codepoint list of
    (bidir, category, decimal, mirrored, numeric) delta records.

    Raises NotImplementedError if the composition exclusions differ,
    and an ad-hoc Difference exception for any field change this
    function does not know how to encode.
    """
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            # k indexes the record fields; see the "Record structure"
            # comment on the UnicodeData class for the field layout
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    # value is the OLD field value: the delta tables
                    # describe how to get back to the old version
                    value = old.table[i][k]
                    if k == 1 and i in PUA_15:
                        # the name is not set in the old.table, but in the
                        # new.table we are using it for aliases and named seq
                        assert value == ''
                    elif k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value is better not 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        # bidi-mirrored flag, stored as '1'/'0' strings
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 16:
                        # derived property changes; not yet
                        pass
                    elif k == 17:
                        # normalization quickchecks are not performed
                        # for older versions
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, list(zip(bidir_changes, category_changes,
                                          decimal_changes, mirrored_changes,
                                          numeric_changes)),
                       normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000874
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000875def open_data(template, version):
876 local = template % ('-'+version,)
877 if not os.path.exists(local):
878 import urllib.request
879 if version == '3.2.0':
880 # irregular url structure
881 url = 'http://www.unicode.org/Public/3.2-Update/' + local
882 else:
883 url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
884 urllib.request.urlretrieve(url, filename=local)
885 if local.endswith('.txt'):
886 return open(local, encoding='utf-8')
887 else:
888 # Unihan.zip
889 return open(local, 'rb')
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000890
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000891# --------------------------------------------------------------------
892# the following support code is taken from the unidb utilities
893# Copyright (c) 1999-2000 by Secret Labs AB
894
895# load a unicode-data file from disk
896
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000897class UnicodeData:
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000898 # Record structure:
899 # [ID, name, category, combining, bidi, decomp, (6)
900 # decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
901 # ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
902 # derived-props] (17)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000903
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000904 def __init__(self, version,
905 linebreakprops=False,
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000906 expand=1,
907 cjk_check=True):
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000908 self.changed = []
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000909 table = [None] * 0x110000
Ezio Melotti2a1e9262011-09-30 08:46:25 +0300910 with open_data(UNICODE_DATA, version) as file:
911 while 1:
912 s = file.readline()
913 if not s:
914 break
915 s = s.strip().split(";")
916 char = int(s[0], 16)
917 table[char] = s
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000918
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000919 cjk_ranges_found = []
920
Martin v. Löwis97225da2002-11-24 23:05:09 +0000921 # expand first-last ranges
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000922 if expand:
923 field = None
Martin v. Löwis97225da2002-11-24 23:05:09 +0000924 for i in range(0, 0x110000):
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000925 s = table[i]
926 if s:
927 if s[1][-6:] == "First>":
928 s[1] = ""
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000929 field = s
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000930 elif s[1][-5:] == "Last>":
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000931 if s[1].startswith("<CJK Ideograph"):
932 cjk_ranges_found.append((field[0],
933 s[0]))
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000934 s[1] = ""
935 field = None
936 elif field:
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000937 f2 = field[:]
938 f2[0] = "%X" % i
939 table[i] = f2
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000940 if cjk_check and cjk_ranges != cjk_ranges_found:
941 raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000942
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000943 # public attributes
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000944 self.filename = UNICODE_DATA % ''
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000945 self.table = table
Georg Brandlbf82e372008-05-16 17:02:34 +0000946 self.chars = list(range(0x110000)) # unicode 3.2
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000947
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300948 # check for name aliases and named sequences, see #12753
949 # aliases and named sequences are not in 3.2.0
950 if version != '3.2.0':
951 self.aliases = []
952 # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
953 # in order to take advantage of the compression and lookup
954 # algorithms used for the other characters
955 pua_index = NAME_ALIASES_START
956 with open_data(NAME_ALIASES, version) as file:
957 for s in file:
958 s = s.strip()
959 if not s or s.startswith('#'):
960 continue
961 char, name = s.split(';')
962 char = int(char, 16)
963 self.aliases.append((name, char))
964 # also store the name in the PUA 1
965 self.table[pua_index][1] = name
966 pua_index += 1
967 assert pua_index - NAME_ALIASES_START == len(self.aliases)
968
969 self.named_sequences = []
970 # store named seqences in the PUA 1, in range U+F0100..,
971 # in order to take advantage of the compression and lookup
972 # algorithms used for the other characters.
973
974 pua_index = NAMED_SEQUENCES_START
975 with open_data(NAMED_SEQUENCES, version) as file:
976 for s in file:
977 s = s.strip()
978 if not s or s.startswith('#'):
979 continue
980 name, chars = s.split(';')
981 chars = tuple(int(char, 16) for char in chars.split())
982 # check that the structure defined in makeunicodename is OK
983 assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
984 assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
985 "the NamedSequence struct and in unicodedata_lookup")
986 self.named_sequences.append((name, chars))
987 # also store these in the PUA 1
988 self.table[pua_index][1] = name
989 pua_index += 1
990 assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)
991
Martin v. Löwis677bde22002-11-23 22:08:15 +0000992 self.exclusions = {}
Ezio Melotti2a1e9262011-09-30 08:46:25 +0300993 with open_data(COMPOSITION_EXCLUSIONS, version) as file:
994 for s in file:
995 s = s.strip()
996 if not s:
997 continue
998 if s[0] == '#':
999 continue
1000 char = int(s.split()[0],16)
1001 self.exclusions[char] = 1
Martin v. Löwis677bde22002-11-23 22:08:15 +00001002
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00001003 widths = [None] * 0x110000
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001004 with open_data(EASTASIAN_WIDTH, version) as file:
1005 for s in file:
1006 s = s.strip()
1007 if not s:
1008 continue
1009 if s[0] == '#':
1010 continue
1011 s = s.split()[0].split(';')
1012 if '..' in s[0]:
1013 first, last = [int(c, 16) for c in s[0].split('..')]
1014 chars = list(range(first, last+1))
1015 else:
1016 chars = [int(s[0], 16)]
1017 for char in chars:
1018 widths[char] = s[1]
1019
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00001020 for i in range(0, 0x110000):
1021 if table[i] is not None:
1022 table[i].append(widths[i])
1023
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001024 for i in range(0, 0x110000):
1025 if table[i] is not None:
1026 table[i].append(set())
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001027
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001028 with open_data(DERIVED_CORE_PROPERTIES, version) as file:
1029 for s in file:
1030 s = s.split('#', 1)[0].strip()
1031 if not s:
1032 continue
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001033
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001034 r, p = s.split(";")
1035 r = r.strip()
1036 p = p.strip()
1037 if ".." in r:
1038 first, last = [int(c, 16) for c in r.split('..')]
1039 chars = list(range(first, last+1))
1040 else:
1041 chars = [int(r, 16)]
1042 for char in chars:
1043 if table[char]:
1044 # Some properties (e.g. Default_Ignorable_Code_Point)
1045 # apply to unassigned code points; ignore them
1046 table[char][-1].add(p)
1047
1048 with open_data(LINE_BREAK, version) as file:
1049 for s in file:
1050 s = s.partition('#')[0]
1051 s = [i.strip() for i in s.split(';')]
1052 if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
1053 continue
1054 if '..' not in s[0]:
1055 first = last = int(s[0], 16)
1056 else:
1057 first, last = [int(c, 16) for c in s[0].split('..')]
1058 for char in range(first, last+1):
1059 table[char][-1].add('Line_Break')
Florent Xicluna806d8cf2010-03-30 19:34:18 +00001060
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001061 # We only want the quickcheck properties
1062 # Format: NF?_QC; Y(es)/N(o)/M(aybe)
1063 # Yes is the default, hence only N and M occur
1064 # In 3.2.0, the format was different (NF?_NO)
1065 # The parsing will incorrectly determine these as
1066 # "yes", however, unicodedata.c will not perform quickchecks
1067 # for older versions, and no delta records will be created.
1068 quickchecks = [0] * 0x110000
1069 qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001070 with open_data(DERIVEDNORMALIZATION_PROPS, version) as file:
1071 for s in file:
1072 if '#' in s:
1073 s = s[:s.index('#')]
1074 s = [i.strip() for i in s.split(';')]
1075 if len(s) < 2 or s[1] not in qc_order:
1076 continue
1077 quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
1078 quickcheck_shift = qc_order.index(s[1])*2
1079 quickcheck <<= quickcheck_shift
1080 if '..' not in s[0]:
1081 first = last = int(s[0], 16)
1082 else:
1083 first, last = [int(c, 16) for c in s[0].split('..')]
1084 for char in range(first, last+1):
1085 assert not (quickchecks[char]>>quickcheck_shift)&3
1086 quickchecks[char] |= quickcheck
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001087 for i in range(0, 0x110000):
1088 if table[i] is not None:
1089 table[i].append(quickchecks[i])
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +00001090
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001091 with open_data(UNIHAN, version) as file:
1092 zip = zipfile.ZipFile(file)
1093 if version == '3.2.0':
1094 data = zip.open('Unihan-3.2.0.txt').read()
1095 else:
1096 data = zip.open('Unihan_NumericValues.txt').read()
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001097 for line in data.decode("utf-8").splitlines():
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +00001098 if not line.startswith('U+'):
1099 continue
1100 code, tag, value = line.split(None, 3)[:3]
1101 if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
1102 'kOtherNumeric'):
1103 continue
1104 value = value.strip().replace(',', '')
1105 i = int(code[2:], 16)
1106 # Patch the numeric field
1107 if table[i] is not None:
1108 table[i][8] = value
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05001109 sc = self.special_casing = {}
1110 with open_data(SPECIAL_CASING, version) as file:
1111 for s in file:
1112 s = s[:-1].split('#', 1)[0]
1113 if not s:
1114 continue
1115 data = s.split("; ")
1116 if data[4]:
1117 # We ignore all conditionals (since they depend on
1118 # languages) except for one, which is hardcoded. See
1119 # handle_capital_sigma in unicodeobject.c.
1120 continue
1121 c = int(data[0], 16)
1122 lower = [int(char, 16) for char in data[1].split()]
1123 title = [int(char, 16) for char in data[2].split()]
1124 upper = [int(char, 16) for char in data[3].split()]
1125 sc[c] = (lower, title, upper)
Benjamin Petersond5890c82012-01-14 13:23:30 -05001126 cf = self.case_folding = {}
1127 if version != '3.2.0':
1128 with open_data(CASE_FOLDING, version) as file:
1129 for s in file:
1130 s = s[:-1].split('#', 1)[0]
1131 if not s:
1132 continue
1133 data = s.split("; ")
1134 if data[1] in "CF":
1135 c = int(data[0], 16)
1136 cf[c] = [int(char, 16) for char in data[2].split()]
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +00001137
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001138 def uselatin1(self):
1139 # restrict character range to ISO Latin 1
Georg Brandlbf82e372008-05-16 17:02:34 +00001140 self.chars = list(range(256))
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001141
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001142# hash table tools
1143
1144# this is a straight-forward reimplementation of Python's built-in
1145# dictionary type, using a static data structure, and a custom string
1146# hash algorithm.
1147
def myhash(s, magic):
    """Case-insensitive string hash for the static name hash table.

    Folds each character of s.upper() into a multiplicative hash with
    the given magic multiplier, mixing any overflow past 24 bits back
    into the low bits so the result stays below 2**24 + 2**24*magic.
    """
    h = 0
    for code in map(ord, s.upper()):
        h = h * magic + code
        overflow = h & 0xff000000
        if overflow:
            h = (h ^ ((overflow >> 24) & 0xff)) & 0x00ffffff
    return h
1156
# Candidate hash-table configurations as (size, poly) pairs, where size
# is a power of two.  Hash.__init__ picks the first entry whose size
# exceeds the number of keys and uses size+poly as the value that
# perturbs the collision-resolution increment (presumably chosen so the
# probe sequence visits every slot -- confirm before changing).
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
1163
class Hash:
    """Static open-addressing hash table, dumpable as C arrays.

    Builds the table eagerly in __init__ from (key, value) pairs using
    myhash and the SIZES table; dump() writes the slot array plus the
    #define constants the C lookup code needs (magic, size, poly).
    """

    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size: first power of two larger than the key
        # count; poly becomes size+poly (see SIZES)
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynomials")

        print(size, "slots in hash table")

        table = [None] * size

        mask = size-1

        # n counts collisions (reported and stored below)
        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            # slot taken: probe with an increment derived from the hash,
            # doubling (mod poly) after each miss
            incr = (h ^ (h >> 3)) & mask;
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print(n, "collisions")
        self.collisions = n

        # empty slots are encoded as 0 in the C array
        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
1227
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001228# stuff to deal with arrays of unsigned integers
1229
class Array:
    """A named sequence of unsigned integers, writable as a C array."""

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        """Write the array to *file* as a static C declaration.

        The element type is the narrowest unsigned C type that holds
        every value; a non-zero trace reports the byte size to stderr.
        """
        size = getsize(self.data)
        if trace:
            print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)
        if size == 1:
            ctype = "unsigned char"
        elif size == 2:
            ctype = "unsigned short"
        else:
            ctype = "unsigned int"
        file.write("static ")
        file.write(ctype)
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            # accumulate items into lines of at most 78 columns
            line = " "
            for item in self.data:
                text = str(item) + ", "
                if len(line) + len(text) > 78:
                    file.write(line + "\n")
                    line = " " + text
                else:
                    line = line + text
            if line.strip():
                file.write(line + "\n")
        file.write("};\n\n")
1261
def getsize(data):
    """Return the smallest C integer size (1, 2 or 4 bytes) that can
    represent every value in the non-empty array *data*."""
    biggest = max(data)
    if biggest >= 65536:
        return 4
    if biggest >= 256:
        return 2
    return 1
1271
Tim Peters21013482000-09-25 07:13:41 +00001272def splitbins(t, trace=0):
1273 """t, trace=0 -> (t1, t2, shift). Split a table to save space.
1274
1275 t is a sequence of ints. This function can be useful to save space if
1276 many of the ints are the same. t1 and t2 are lists of ints, and shift
1277 is an int, chosen to minimize the combined size of t1 and t2 (in C
1278 code), and where for each i in range(len(t)),
1279 t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1280 where mask is a bitmask isolating the last "shift" bits.
1281
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001282 If optional arg trace is non-zero (default zero), progress info
1283 is printed to sys.stderr. The higher the value, the more info
1284 you'll get.
Tim Peters21013482000-09-25 07:13:41 +00001285 """
1286
Tim Peters21013482000-09-25 07:13:41 +00001287 if trace:
1288 def dump(t1, t2, shift, bytes):
Collin Winter6afaeb72007-08-03 17:06:41 +00001289 print("%d+%d bins at shift %d; %d bytes" % (
1290 len(t1), len(t2), shift, bytes), file=sys.stderr)
1291 print("Size of original table:", len(t)*getsize(t), \
1292 "bytes", file=sys.stderr)
Tim Peters21013482000-09-25 07:13:41 +00001293 n = len(t)-1 # last valid index
1294 maxshift = 0 # the most we can shift n and still have something left
1295 if n > 0:
1296 while n >> 1:
1297 n >>= 1
1298 maxshift += 1
1299 del n
Christian Heimesa37d4c62007-12-04 23:02:19 +00001300 bytes = sys.maxsize # smallest total size so far
Tim Peters21013482000-09-25 07:13:41 +00001301 t = tuple(t) # so slices can be dict keys
1302 for shift in range(maxshift + 1):
1303 t1 = []
1304 t2 = []
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001305 size = 2**shift
1306 bincache = {}
Tim Peters21013482000-09-25 07:13:41 +00001307 for i in range(0, len(t), size):
1308 bin = t[i:i+size]
1309 index = bincache.get(bin)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001310 if index is None:
Tim Peters21013482000-09-25 07:13:41 +00001311 index = len(t2)
1312 bincache[bin] = index
1313 t2.extend(bin)
1314 t1.append(index >> shift)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001315 # determine memory size
Tim Peters21013482000-09-25 07:13:41 +00001316 b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001317 if trace > 1:
Tim Peters21013482000-09-25 07:13:41 +00001318 dump(t1, t2, shift, b)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001319 if b < bytes:
Tim Peters21013482000-09-25 07:13:41 +00001320 best = t1, t2, shift
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001321 bytes = b
Tim Peters21013482000-09-25 07:13:41 +00001322 t1, t2, shift = best
1323 if trace:
Collin Winter6afaeb72007-08-03 17:06:41 +00001324 print("Best:", end=' ', file=sys.stderr)
Tim Peters21013482000-09-25 07:13:41 +00001325 dump(t1, t2, shift, bytes)
1326 if __debug__:
1327 # exhaustively verify that the decomposition is correct
1328 mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
Guido van Rossum805365e2007-05-07 22:24:25 +00001329 for i in range(len(t)):
Tim Peters21013482000-09-25 07:13:41 +00001330 assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1331 return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001332
if __name__ == "__main__":
    # regenerate the database headers; maketables is defined earlier in
    # this file (the argument presumably enables trace output -- confirm
    # against maketables' signature)
    maketables(1)