blob: b7d31e8806652b58b98e76b665849fc0ee8ef315 [file] [log] [blame]
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001#
Fredrik Lundhe9133f72000-09-25 17:59:57 +00002# (re)generate unicode property and type databases
3#
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00004# this script converts a unicode 3.2 database file to
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00005# Modules/unicodedata_db.h, Modules/unicodename_db.h,
6# and Objects/unicodetype_db.h
Fredrik Lundhcfcea492000-09-25 08:07:06 +00007#
8# history:
9# 2000-09-24 fl created (based on bits and pieces from unidb)
10# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
Fredrik Lundhe9133f72000-09-25 17:59:57 +000011# 2000-09-25 fl added character type table
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000012# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000013# 2000-11-03 fl expand first/last ranges
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000014# 2001-01-19 fl added character name tables (2.1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000015# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold
Martin v. Löwis677bde22002-11-23 22:08:15 +000016# 2002-09-11 wd use string methods
17# 2002-10-18 mvl update to Unicode 3.2
18# 2002-10-22 mvl generate NFC tables
Martin v. Löwis97225da2002-11-24 23:05:09 +000019# 2002-11-24 mvl expand all ranges, sort names version-independently
Martin v. Löwisb5c980b2002-11-25 09:13:37 +000020# 2002-11-25 mvl add UNIDATA_VERSION
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +000021# 2004-05-29 perky add east asian width information
Martin v. Löwis43179c82006-03-11 12:43:44 +000022# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
Georg Brandld52429f2008-07-04 15:55:02 +000023# 2008-06-11 gb add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
Ezio Melotti931b8aa2011-10-21 21:57:36 +030024# 2011-10-21 ezio add support for name aliases and named sequences
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050025# 2012-01 benjamin add full case mappings
Fredrik Lundhcfcea492000-09-25 08:07:06 +000026#
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000027# written by Fredrik Lundh (fredrik@pythonware.com)
Fredrik Lundhf367cac2000-09-24 23:18:31 +000028#
29
import os
import sys
import zipfile

from textwrap import dedent

# Name of this generator script and its own version, both stamped into
# the header comment of every generated file.
SCRIPT = sys.argv[0]
VERSION = "3.2"

# The Unicode Database
# --------------------
# When changing UCD version please update
# * Doc/library/stdtypes.rst, and
# * Doc/library/unicodedata.rst
# * Doc/reference/lexical_analysis.rst (two occurrences)
UNIDATA_VERSION = "7.0.0"

# Input file name templates; the %s is filled with "" for the current
# UCD version or "-<version>" for an old snapshot (see maketables()).
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt"
CASE_FOLDING = "CaseFolding%s.txt"

# Private Use Areas -- in planes 1, 15, 16
PUA_1 = range(0xE000, 0xF900)
PUA_15 = range(0xF0000, 0xFFFFE)
PUA_16 = range(0x100000, 0x10FFFE)

# we use these ranges of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0200

# Older UCD releases for which delta ("changed") tables are generated.
old_versions = ["3.2.0"]

# General-category abbreviations; the list position defines the numeric
# code written into the generated tables, so the order must not change.
# NOTE(review): "Cn" appears at index 0 and again at index 17 -- kept
# as-is, since the positions are baked into the generated data.
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

# Bidirectional-class abbreviations; position defines the numeric code.
BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON", "LRI", "RLI", "FSI", "PDI" ]

# East-asian-width abbreviations; position defines the numeric code.
EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

# Line_Break classes treated as mandatory breaks.
MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NUMERIC_MASK = 0x800
CASE_IGNORABLE_MASK = 0x1000
CASED_MASK = 0x2000
EXTENDED_CASE_MASK = 0x4000

# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
    ('3400', '4DB5'),
    ('4E00', '9FCC'),
    ('20000', '2A6D6'),
    ('2A700', '2B734'),
    ('2B740', '2B81D')
]
107
def maketables(trace=0):
    """Drive generation of all three Unicode database headers.

    Loads the current UnicodeData files, folds in every release listed
    in ``old_versions`` (so delta tables can be produced), then writes
    the name, property and type tables.  *trace* is passed through to
    the individual table writers.
    """
    print("--- Reading", UNICODE_DATA % "", "...")

    version = ""
    db = UnicodeData(UNIDATA_VERSION)

    # count assigned code points (entries that are not None/empty)
    print(sum(1 for entry in db.table if entry), "characters")

    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-" + version), "...")
        previous = UnicodeData(version, cjk_check=False)
        print(sum(1 for entry in previous.table if entry), "characters")
        merge_old_version(version, db, previous)

    makeunicodename(db, trace)
    makeunicodedata(db, trace)
    makeunicodetype(db, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000126
127# --------------------------------------------------------------------
128# unicode character properties
129
130def makeunicodedata(unicode, trace):
131
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000132 dummy = (0, 0, 0, 0, 0, 0)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000133 table = [dummy]
134 cache = {0: dummy}
135 index = [0] * len(unicode.chars)
136
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000137 FILE = "Modules/unicodedata_db.h"
138
Collin Winter6afaeb72007-08-03 17:06:41 +0000139 print("--- Preparing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000140
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000141 # 1) database properties
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000142
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000143 for char in unicode.chars:
144 record = unicode.table[char]
145 if record:
146 # extract database properties
147 category = CATEGORY_NAMES.index(record[2])
148 combining = int(record[3])
149 bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
150 mirrored = record[9] == "Y"
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000151 eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000152 normalizationquickcheck = record[17]
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000153 item = (
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000154 category, combining, bidirectional, mirrored, eastasianwidth,
155 normalizationquickcheck
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000156 )
157 # add entry to index and item tables
158 i = cache.get(item)
159 if i is None:
160 cache[item] = i = len(table)
161 table.append(item)
162 index[char] = i
163
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000164 # 2) decomposition data
165
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000166 decomp_data = [0]
167 decomp_prefix = [""]
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000168 decomp_index = [0] * len(unicode.chars)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000169 decomp_size = 0
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000170
Martin v. Löwis677bde22002-11-23 22:08:15 +0000171 comp_pairs = []
172 comp_first = [None] * len(unicode.chars)
173 comp_last = [None] * len(unicode.chars)
174
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000175 for char in unicode.chars:
176 record = unicode.table[char]
177 if record:
178 if record[5]:
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000179 decomp = record[5].split()
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000180 if len(decomp) > 19:
Collin Wintera817e582007-08-22 23:05:06 +0000181 raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000182 # prefix
183 if decomp[0][0] == "<":
184 prefix = decomp.pop(0)
185 else:
186 prefix = ""
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000187 try:
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000188 i = decomp_prefix.index(prefix)
189 except ValueError:
190 i = len(decomp_prefix)
191 decomp_prefix.append(prefix)
192 prefix = i
193 assert prefix < 256
194 # content
Georg Brandlbf82e372008-05-16 17:02:34 +0000195 decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
Martin v. Löwis677bde22002-11-23 22:08:15 +0000196 # Collect NFC pairs
197 if not prefix and len(decomp) == 3 and \
198 char not in unicode.exclusions and \
199 unicode.table[decomp[1]][3] == "0":
200 p, l, r = decomp
201 comp_first[l] = 1
202 comp_last[r] = 1
203 comp_pairs.append((l,r,char))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000204 try:
205 i = decomp_data.index(decomp)
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000206 except ValueError:
207 i = len(decomp_data)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000208 decomp_data.extend(decomp)
209 decomp_size = decomp_size + len(decomp) * 2
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000210 else:
211 i = 0
212 decomp_index[char] = i
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000213
Martin v. Löwis677bde22002-11-23 22:08:15 +0000214 f = l = 0
215 comp_first_ranges = []
216 comp_last_ranges = []
217 prev_f = prev_l = None
218 for i in unicode.chars:
219 if comp_first[i] is not None:
220 comp_first[i] = f
221 f += 1
222 if prev_f is None:
223 prev_f = (i,i)
224 elif prev_f[1]+1 == i:
225 prev_f = prev_f[0],i
226 else:
227 comp_first_ranges.append(prev_f)
228 prev_f = (i,i)
229 if comp_last[i] is not None:
230 comp_last[i] = l
231 l += 1
232 if prev_l is None:
233 prev_l = (i,i)
234 elif prev_l[1]+1 == i:
235 prev_l = prev_l[0],i
236 else:
237 comp_last_ranges.append(prev_l)
238 prev_l = (i,i)
239 comp_first_ranges.append(prev_f)
240 comp_last_ranges.append(prev_l)
241 total_first = f
242 total_last = l
243
244 comp_data = [0]*(total_first*total_last)
245 for f,l,char in comp_pairs:
246 f = comp_first[f]
247 l = comp_last[l]
248 comp_data[f*total_last+l] = char
249
Collin Winter6afaeb72007-08-03 17:06:41 +0000250 print(len(table), "unique properties")
251 print(len(decomp_prefix), "unique decomposition prefixes")
252 print(len(decomp_data), "unique decomposition entries:", end=' ')
253 print(decomp_size, "bytes")
254 print(total_first, "first characters in NFC")
255 print(total_last, "last characters in NFC")
256 print(len(comp_pairs), "NFC pairs")
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000257
Collin Winter6afaeb72007-08-03 17:06:41 +0000258 print("--- Writing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000259
Fred Drake9c685052000-10-26 03:56:46 +0000260 fp = open(FILE, "w")
Collin Winter6afaeb72007-08-03 17:06:41 +0000261 print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
262 print(file=fp)
263 print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
264 print("/* a list of unique database records */", file=fp)
265 print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000266 for item in table:
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000267 print(" {%d, %d, %d, %d, %d, %d}," % item, file=fp)
Collin Winter6afaeb72007-08-03 17:06:41 +0000268 print("};", file=fp)
269 print(file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000270
Collin Winter6afaeb72007-08-03 17:06:41 +0000271 print("/* Reindexing of NFC first characters. */", file=fp)
272 print("#define TOTAL_FIRST",total_first, file=fp)
273 print("#define TOTAL_LAST",total_last, file=fp)
274 print("struct reindex{int start;short count,index;};", file=fp)
Martin v. Löwis59683e82008-06-13 07:50:45 +0000275 print("static struct reindex nfc_first[] = {", file=fp)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000276 for start,end in comp_first_ranges:
Collin Winter6afaeb72007-08-03 17:06:41 +0000277 print(" { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)
278 print(" {0,0,0}", file=fp)
279 print("};\n", file=fp)
Martin v. Löwis59683e82008-06-13 07:50:45 +0000280 print("static struct reindex nfc_last[] = {", file=fp)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000281 for start,end in comp_last_ranges:
Collin Winter6afaeb72007-08-03 17:06:41 +0000282 print(" { %d, %d, %d}," % (start,end-start,comp_last[start]), file=fp)
283 print(" {0,0,0}", file=fp)
284 print("};\n", file=fp)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000285
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000286 # FIXME: <fl> the following tables could be made static, and
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000287 # the support code moved into unicodedatabase.c
288
Collin Winter6afaeb72007-08-03 17:06:41 +0000289 print("/* string literals */", file=fp)
290 print("const char *_PyUnicode_CategoryNames[] = {", file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000291 for name in CATEGORY_NAMES:
Collin Winter6afaeb72007-08-03 17:06:41 +0000292 print(" \"%s\"," % name, file=fp)
293 print(" NULL", file=fp)
294 print("};", file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000295
Collin Winter6afaeb72007-08-03 17:06:41 +0000296 print("const char *_PyUnicode_BidirectionalNames[] = {", file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000297 for name in BIDIRECTIONAL_NAMES:
Collin Winter6afaeb72007-08-03 17:06:41 +0000298 print(" \"%s\"," % name, file=fp)
299 print(" NULL", file=fp)
300 print("};", file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000301
Collin Winter6afaeb72007-08-03 17:06:41 +0000302 print("const char *_PyUnicode_EastAsianWidthNames[] = {", file=fp)
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000303 for name in EASTASIANWIDTH_NAMES:
Collin Winter6afaeb72007-08-03 17:06:41 +0000304 print(" \"%s\"," % name, file=fp)
305 print(" NULL", file=fp)
306 print("};", file=fp)
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000307
Collin Winter6afaeb72007-08-03 17:06:41 +0000308 print("static const char *decomp_prefix[] = {", file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000309 for name in decomp_prefix:
Collin Winter6afaeb72007-08-03 17:06:41 +0000310 print(" \"%s\"," % name, file=fp)
311 print(" NULL", file=fp)
312 print("};", file=fp)
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000313
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000314 # split record index table
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000315 index1, index2, shift = splitbins(index, trace)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000316
Collin Winter6afaeb72007-08-03 17:06:41 +0000317 print("/* index tables for the database records */", file=fp)
318 print("#define SHIFT", shift, file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000319 Array("index1", index1).dump(fp, trace)
320 Array("index2", index2).dump(fp, trace)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000321
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000322 # split decomposition index table
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000323 index1, index2, shift = splitbins(decomp_index, trace)
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000324
Collin Winter6afaeb72007-08-03 17:06:41 +0000325 print("/* decomposition data */", file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000326 Array("decomp_data", decomp_data).dump(fp, trace)
327
Collin Winter6afaeb72007-08-03 17:06:41 +0000328 print("/* index tables for the decomposition data */", file=fp)
329 print("#define DECOMP_SHIFT", shift, file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000330 Array("decomp_index1", index1).dump(fp, trace)
331 Array("decomp_index2", index2).dump(fp, trace)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000332
Martin v. Löwis677bde22002-11-23 22:08:15 +0000333 index, index2, shift = splitbins(comp_data, trace)
Collin Winter6afaeb72007-08-03 17:06:41 +0000334 print("/* NFC pairs */", file=fp)
335 print("#define COMP_SHIFT", shift, file=fp)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000336 Array("comp_index", index).dump(fp, trace)
337 Array("comp_data", index2).dump(fp, trace)
338
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000339 # Generate delta tables for old versions
340 for version, table, normalization in unicode.changed:
341 cversion = version.replace(".","_")
342 records = [table[0]]
343 cache = {table[0]:0}
344 index = [0] * len(table)
345 for i, record in enumerate(table):
346 try:
347 index[i] = cache[record]
348 except KeyError:
349 index[i] = cache[record] = len(records)
350 records.append(record)
351 index1, index2, shift = splitbins(index, trace)
Collin Winter6afaeb72007-08-03 17:06:41 +0000352 print("static const change_record change_records_%s[] = {" % cversion, file=fp)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000353 for record in records:
Collin Winter6afaeb72007-08-03 17:06:41 +0000354 print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
355 print("};", file=fp)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000356 Array("changes_%s_index" % cversion, index1).dump(fp, trace)
357 Array("changes_%s_data" % cversion, index2).dump(fp, trace)
Collin Winter6afaeb72007-08-03 17:06:41 +0000358 print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
359 print("{", file=fp)
360 print("\tint index;", file=fp)
361 print("\tif (n >= 0x110000) index = 0;", file=fp)
362 print("\telse {", file=fp)
363 print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
364 print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
365 (cversion, shift, ((1<<shift)-1)), file=fp)
366 print("\t}", file=fp)
367 print("\treturn change_records_%s+index;" % cversion, file=fp)
368 print("}\n", file=fp)
369 print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
370 print("{", file=fp)
371 print("\tswitch(n) {", file=fp)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000372 for k, v in normalization:
Collin Winter6afaeb72007-08-03 17:06:41 +0000373 print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
374 print("\tdefault: return 0;", file=fp)
375 print("\t}\n}\n", file=fp)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000376
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000377 fp.close()
378
# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):
    """Generate Objects/unicodetype_db.h from *unicode* (a UnicodeData
    instance, defined elsewhere in this script).

    Emits: the deduplicated type records (case mappings, digit values,
    flag bits), the extended-case array for one-to-many case mappings,
    the two-level type index, and C switch functions for the numeric
    value, whitespace and line-break predicates.  *trace* is forwarded
    to splitbins() and Array.dump().

    Fix vs. previous revision: removed the local ``delta = True``,
    which was assigned but never read.
    """

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types
    # A record is (upper, lower, title, decimal, digit, flags); slot 0
    # is the all-zero record used for unassigned code points.
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}        # numeric value (string) -> [code points]
    spaces = []         # code points flagged as whitespace
    linebreaks = []     # code points flagged as line breaks
    extra_casing = []   # flat array backing one-to-many case mappings

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if "Lowercase" in properties:
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if "Uppercase" in properties:
                flags |= UPPER_MASK
            # printable = everything except control/separator categories,
            # with U+0020 SPACE special-cased back in
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            if "Cased" in properties:
                flags |= CASED_MASK
            if "Case_Ignorable" in properties:
                flags |= CASE_IGNORABLE_MASK
            sc = unicode.special_casing.get(char)
            cf = unicode.case_folding.get(char, [char])
            # simple one-to-one case mappings (hex fields 12-14); a
            # missing field means the character maps to itself (title
            # falls back to the uppercase mapping)
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                title = upper
            # a casefold differing from the lowercase form also forces
            # the extended-case representation
            if sc is None and cf != [lower]:
                sc = ([lower], [title], [upper])
            if sc is None:
                # simple case: store signed deltas relative to the
                # character (0,0,0 when it maps to itself)
                if upper == lower == title:
                    upper = lower = title = 0
                else:
                    upper = upper - char
                    lower = lower - char
                    title = title - char
                    # deltas must fit in a signed 32-bit C field
                    assert (abs(upper) <= 2147483647 and
                            abs(lower) <= 2147483647 and
                            abs(title) <= 2147483647)
            else:
                # This happens either when some character maps to more than one
                # character in uppercase, lowercase, or titlecase or the
                # casefolded version of the character is different from the
                # lowercase. The extra characters are stored in a different
                # array.
                # Packing: low bits = start index into extra_casing,
                # bits 24+ = mapping length; for `lower`, bits 20-23
                # additionally hold the casefold length when it differs.
                flags |= EXTENDED_CASE_MASK
                lower = len(extra_casing) | (len(sc[0]) << 24)
                extra_casing.extend(sc[0])
                if cf != sc[0]:
                    lower |= len(cf) << 20
                    extra_casing.extend(cf)
                upper = len(extra_casing) | (len(sc[2]) << 24)
                extra_casing.extend(sc[2])
                # Title is probably equal to upper.
                if sc[1] == sc[2]:
                    title = upper
                else:
                    title = len(extra_casing) | (len(sc[1]) << 24)
                    extra_casing.extend(sc[1])
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables; identical records are
            # shared between code points via the cache
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")
    print(len(extra_casing), "extended case array")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("/* a list of unique character type descriptors */", file=fp)
    print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    print("/* extended case mappings */", file=fp)
    print(file=fp)
    print("const Py_UCS4 _PyUnicode_ExtendedCase[] = {", file=fp)
    for c in extra_casing:
        print("    %d," % c, file=fp)
    print("};", file=fp)
    print(file=fp)

    # split decomposition index table
    index1, index2, shift = splitbins(index, trace)

    print("/* type indexes */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # Generate code for _PyUnicode_ToNumeric()
    numeric_items = sorted(numeric.items())
    print('/* Returns the numeric value as double for Unicode characters', file=fp)
    print(' * having this property, -1.0 otherwise.', file=fp)
    print(' */', file=fp)
    print('double _PyUnicode_ToNumeric(Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for value, codepoints in numeric_items:
        # Turn text into float literals (fractions like "1/3" become
        # a C division of two float literals)
        parts = value.split('/')
        parts = [repr(float(part)) for part in parts]
        value = '/'.join(parts)

        codepoints.sort()
        for codepoint in codepoints:
            print('    case 0x%04X:' % (codepoint,), file=fp)
        print('        return (double) %s;' % (value,), file=fp)
    print('    }', file=fp)
    print('    return -1.0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsWhitespace()
    print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)
    print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsWhitespace(const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)

    for codepoint in sorted(spaces):
        print('    case 0x%04X:' % (codepoint,), file=fp)
    print('        return 1;', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsLinebreak()
    print("/* Returns 1 for Unicode characters having the line break", file=fp)
    print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp)
    print(" * type 'B', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for codepoint in sorted(linebreaks):
        print('    case 0x%04X:' % (codepoint,), file=fp)
    print('        return 1;', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    fp.close()
591
592# --------------------------------------------------------------------
593# unicode name database
594
def makeunicodename(unicode, trace):
    """Generate Modules/unicodename_db.h (the character name database).

    Emits three structures:
      * a compressed word "lexicon" (every distinct word used in names),
      * a per-character "phrasebook" of word indexes (code->name), and
      * a static hash table mapping names back to code points.
    *unicode* is a UnicodeData instance; *trace* is forwarded to the
    Array/Hash dump routines.
    """

    FILE = "Modules/unicodename_db.h"

    print("--- Preparing", FILE, "...")

    # collect names: one NUL-terminated name per assigned character;
    # placeholder labels such as "<control>" are skipped
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print(len(list(n for n in names if n is not None)), "distinct names")

    # collect unique words from names (note that we distinguish between
    # words inside a sentence and words ending a sentence; the latter
    # include the trailing null byte)

    words = {}
    n = b = 0  # n: total word count, b: total name bytes (statistics only)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            # each words[] entry is a list whose length is the word's
            # frequency; its first element is the insertion index
            # (NOTE: the inner loop variable deliberately shadows w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print(n, "words in text;", b, "bytes")

    wordlist = list(words.items())

    # sort on falling frequency, then by name
    def word_key(a):
        aword, alist = a
        return -len(alist), aword
    wordlist.sort(key=word_key)

    # figure out how many phrasebook escapes we need; an escape byte
    # selects the high byte of a two-byte word index (see below)
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print(escapes, "escapes")

    short = 256 - escapes

    assert short > 0

    print(short, "short indexes in lexicon")

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print(n, "short indexes in phrasebook")

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        # a word's index is the position its offset gets in lexicon_offset
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = list(map(ord, lexicon))

    # generate phrasebook from names and lexicon: for each named
    # character, a run of word indexes -- one byte if < short, else an
    # escaped two-byte pair
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    # the phrasebook must fit in an unsigned char array
    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set.  if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("#define NAME_MAXLEN", 256, file=fp)
    print(file=fp)
    print("/* lexicon */", file=fp)
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split the phrasebook offset table into a two-level index
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print("/* code->name phrasebook */", file=fp)
    print("#define phrasebook_shift", shift, file=fp)
    print("#define phrasebook_short", short, file=fp)

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print("/* name->code dictionary */", file=fp)
    codehash.dump(fp, trace)

    print(file=fp)
    print('static const unsigned int aliases_start = %#x;' %
          NAME_ALIASES_START, file=fp)
    print('static const unsigned int aliases_end = %#x;' %
          (NAME_ALIASES_START + len(unicode.aliases)), file=fp)

    print('static const unsigned int name_aliases[] = {', file=fp)
    for name, codepoint in unicode.aliases:
        print('    0x%04X,' % codepoint, file=fp)
    print('};', file=fp)

    # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
    # so we are using Py_UCS2 seq[4].  This needs to be updated if longer
    # sequences or sequences with non-BMP chars are added.
    # unicodedata_lookup should be adapted too.
    print(dedent("""
        typedef struct NamedSequence {
            int seqlen;
            Py_UCS2 seq[4];
        } named_sequence;
        """), file=fp)

    print('static const unsigned int named_sequences_start = %#x;' %
          NAMED_SEQUENCES_START, file=fp)
    print('static const unsigned int named_sequences_end = %#x;' %
          (NAMED_SEQUENCES_START + len(unicode.named_sequences)), file=fp)

    print('static const named_sequence named_sequences[] = {', file=fp)
    for name, sequence in unicode.named_sequences:
        seq_str = ', '.join('0x%04X' % cp for cp in sequence)
        print('    {%d, {%s}},' % (len(sequence), seq_str), file=fp)
    print('};', file=fp)

    fp.close()
786
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000787
def merge_old_version(version, new, old):
    """Append to new.changed a delta record describing how the older UCD
    snapshot *old* differs from the current snapshot *new*.

    The record is a triple: (version, zipped per-character change lists,
    normalization_changes).  In the change lists, 0xFF encodes "no
    change" (for the numeric field: 0 is "no change", -1 is "had no
    numeric value").  Field indexes *k* follow the record structure
    documented on UnicodeData.
    """
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ, field by field
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 1 and i in PUA_15:
                        # the name is not set in the old.table, but in the
                        # new.table we are using it for aliases and named seq
                        assert value == ''
                    elif k == 2:
                        # general category
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        # bidirectional class
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        # decomposition mapping
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        # decimal digit value
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # numeric value
                        # Since 0 encodes "no change", the old value is better not 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        # bidi-mirrored flag, recorded as '0'/'1' characters
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 16:
                        # derived property changes; not yet
                        pass
                    elif k == 17:
                        # normalization quickchecks are not performed
                        # for older versions
                        pass
                    else:
                        # unexpected field change: fail loudly with context
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, list(zip(bidir_changes, category_changes,
                                          decimal_changes, mirrored_changes,
                                          numeric_changes)),
                        normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000878
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000879def open_data(template, version):
880 local = template % ('-'+version,)
881 if not os.path.exists(local):
882 import urllib.request
883 if version == '3.2.0':
884 # irregular url structure
885 url = 'http://www.unicode.org/Public/3.2-Update/' + local
886 else:
887 url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
888 urllib.request.urlretrieve(url, filename=local)
889 if local.endswith('.txt'):
890 return open(local, encoding='utf-8')
891 else:
892 # Unihan.zip
893 return open(local, 'rb')
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000894
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000895# --------------------------------------------------------------------
896# the following support code is taken from the unidb utilities
897# Copyright (c) 1999-2000 by Secret Labs AB
898
899# load a unicode-data file from disk
900
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000901class UnicodeData:
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000902 # Record structure:
903 # [ID, name, category, combining, bidi, decomp, (6)
904 # decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
905 # ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
906 # derived-props] (17)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000907
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000908 def __init__(self, version,
909 linebreakprops=False,
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000910 expand=1,
911 cjk_check=True):
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000912 self.changed = []
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000913 table = [None] * 0x110000
Ezio Melotti2a1e9262011-09-30 08:46:25 +0300914 with open_data(UNICODE_DATA, version) as file:
915 while 1:
916 s = file.readline()
917 if not s:
918 break
919 s = s.strip().split(";")
920 char = int(s[0], 16)
921 table[char] = s
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000922
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000923 cjk_ranges_found = []
924
Martin v. Löwis97225da2002-11-24 23:05:09 +0000925 # expand first-last ranges
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000926 if expand:
927 field = None
Martin v. Löwis97225da2002-11-24 23:05:09 +0000928 for i in range(0, 0x110000):
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000929 s = table[i]
930 if s:
931 if s[1][-6:] == "First>":
932 s[1] = ""
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000933 field = s
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000934 elif s[1][-5:] == "Last>":
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000935 if s[1].startswith("<CJK Ideograph"):
936 cjk_ranges_found.append((field[0],
937 s[0]))
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000938 s[1] = ""
939 field = None
940 elif field:
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000941 f2 = field[:]
942 f2[0] = "%X" % i
943 table[i] = f2
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000944 if cjk_check and cjk_ranges != cjk_ranges_found:
945 raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000946
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000947 # public attributes
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000948 self.filename = UNICODE_DATA % ''
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000949 self.table = table
Georg Brandlbf82e372008-05-16 17:02:34 +0000950 self.chars = list(range(0x110000)) # unicode 3.2
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000951
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300952 # check for name aliases and named sequences, see #12753
953 # aliases and named sequences are not in 3.2.0
954 if version != '3.2.0':
955 self.aliases = []
956 # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
957 # in order to take advantage of the compression and lookup
958 # algorithms used for the other characters
959 pua_index = NAME_ALIASES_START
960 with open_data(NAME_ALIASES, version) as file:
961 for s in file:
962 s = s.strip()
963 if not s or s.startswith('#'):
964 continue
Benjamin Peterson71f660e2012-02-20 22:24:29 -0500965 char, name, abbrev = s.split(';')
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300966 char = int(char, 16)
967 self.aliases.append((name, char))
968 # also store the name in the PUA 1
969 self.table[pua_index][1] = name
970 pua_index += 1
971 assert pua_index - NAME_ALIASES_START == len(self.aliases)
972
973 self.named_sequences = []
Ezio Melotti7c4a7e62013-08-26 01:32:56 +0300974 # store named sequences in the PUA 1, in range U+F0100..,
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300975 # in order to take advantage of the compression and lookup
976 # algorithms used for the other characters.
977
Benjamin Peterson71f660e2012-02-20 22:24:29 -0500978 assert pua_index < NAMED_SEQUENCES_START
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300979 pua_index = NAMED_SEQUENCES_START
980 with open_data(NAMED_SEQUENCES, version) as file:
981 for s in file:
982 s = s.strip()
983 if not s or s.startswith('#'):
984 continue
985 name, chars = s.split(';')
986 chars = tuple(int(char, 16) for char in chars.split())
987 # check that the structure defined in makeunicodename is OK
988 assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
989 assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
990 "the NamedSequence struct and in unicodedata_lookup")
991 self.named_sequences.append((name, chars))
992 # also store these in the PUA 1
993 self.table[pua_index][1] = name
994 pua_index += 1
995 assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)
996
Martin v. Löwis677bde22002-11-23 22:08:15 +0000997 self.exclusions = {}
Ezio Melotti2a1e9262011-09-30 08:46:25 +0300998 with open_data(COMPOSITION_EXCLUSIONS, version) as file:
999 for s in file:
1000 s = s.strip()
1001 if not s:
1002 continue
1003 if s[0] == '#':
1004 continue
1005 char = int(s.split()[0],16)
1006 self.exclusions[char] = 1
Martin v. Löwis677bde22002-11-23 22:08:15 +00001007
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00001008 widths = [None] * 0x110000
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001009 with open_data(EASTASIAN_WIDTH, version) as file:
1010 for s in file:
1011 s = s.strip()
1012 if not s:
1013 continue
1014 if s[0] == '#':
1015 continue
1016 s = s.split()[0].split(';')
1017 if '..' in s[0]:
1018 first, last = [int(c, 16) for c in s[0].split('..')]
1019 chars = list(range(first, last+1))
1020 else:
1021 chars = [int(s[0], 16)]
1022 for char in chars:
1023 widths[char] = s[1]
1024
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00001025 for i in range(0, 0x110000):
1026 if table[i] is not None:
1027 table[i].append(widths[i])
1028
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001029 for i in range(0, 0x110000):
1030 if table[i] is not None:
1031 table[i].append(set())
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001032
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001033 with open_data(DERIVED_CORE_PROPERTIES, version) as file:
1034 for s in file:
1035 s = s.split('#', 1)[0].strip()
1036 if not s:
1037 continue
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001038
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001039 r, p = s.split(";")
1040 r = r.strip()
1041 p = p.strip()
1042 if ".." in r:
1043 first, last = [int(c, 16) for c in r.split('..')]
1044 chars = list(range(first, last+1))
1045 else:
1046 chars = [int(r, 16)]
1047 for char in chars:
1048 if table[char]:
1049 # Some properties (e.g. Default_Ignorable_Code_Point)
1050 # apply to unassigned code points; ignore them
1051 table[char][-1].add(p)
1052
1053 with open_data(LINE_BREAK, version) as file:
1054 for s in file:
1055 s = s.partition('#')[0]
1056 s = [i.strip() for i in s.split(';')]
1057 if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
1058 continue
1059 if '..' not in s[0]:
1060 first = last = int(s[0], 16)
1061 else:
1062 first, last = [int(c, 16) for c in s[0].split('..')]
1063 for char in range(first, last+1):
1064 table[char][-1].add('Line_Break')
Florent Xicluna806d8cf2010-03-30 19:34:18 +00001065
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001066 # We only want the quickcheck properties
1067 # Format: NF?_QC; Y(es)/N(o)/M(aybe)
1068 # Yes is the default, hence only N and M occur
1069 # In 3.2.0, the format was different (NF?_NO)
1070 # The parsing will incorrectly determine these as
1071 # "yes", however, unicodedata.c will not perform quickchecks
1072 # for older versions, and no delta records will be created.
1073 quickchecks = [0] * 0x110000
1074 qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001075 with open_data(DERIVEDNORMALIZATION_PROPS, version) as file:
1076 for s in file:
1077 if '#' in s:
1078 s = s[:s.index('#')]
1079 s = [i.strip() for i in s.split(';')]
1080 if len(s) < 2 or s[1] not in qc_order:
1081 continue
1082 quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
1083 quickcheck_shift = qc_order.index(s[1])*2
1084 quickcheck <<= quickcheck_shift
1085 if '..' not in s[0]:
1086 first = last = int(s[0], 16)
1087 else:
1088 first, last = [int(c, 16) for c in s[0].split('..')]
1089 for char in range(first, last+1):
1090 assert not (quickchecks[char]>>quickcheck_shift)&3
1091 quickchecks[char] |= quickcheck
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001092 for i in range(0, 0x110000):
1093 if table[i] is not None:
1094 table[i].append(quickchecks[i])
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +00001095
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001096 with open_data(UNIHAN, version) as file:
1097 zip = zipfile.ZipFile(file)
1098 if version == '3.2.0':
1099 data = zip.open('Unihan-3.2.0.txt').read()
1100 else:
1101 data = zip.open('Unihan_NumericValues.txt').read()
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001102 for line in data.decode("utf-8").splitlines():
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +00001103 if not line.startswith('U+'):
1104 continue
1105 code, tag, value = line.split(None, 3)[:3]
1106 if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
1107 'kOtherNumeric'):
1108 continue
1109 value = value.strip().replace(',', '')
1110 i = int(code[2:], 16)
1111 # Patch the numeric field
1112 if table[i] is not None:
1113 table[i][8] = value
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05001114 sc = self.special_casing = {}
1115 with open_data(SPECIAL_CASING, version) as file:
1116 for s in file:
1117 s = s[:-1].split('#', 1)[0]
1118 if not s:
1119 continue
1120 data = s.split("; ")
1121 if data[4]:
1122 # We ignore all conditionals (since they depend on
1123 # languages) except for one, which is hardcoded. See
1124 # handle_capital_sigma in unicodeobject.c.
1125 continue
1126 c = int(data[0], 16)
1127 lower = [int(char, 16) for char in data[1].split()]
1128 title = [int(char, 16) for char in data[2].split()]
1129 upper = [int(char, 16) for char in data[3].split()]
1130 sc[c] = (lower, title, upper)
Benjamin Petersond5890c82012-01-14 13:23:30 -05001131 cf = self.case_folding = {}
1132 if version != '3.2.0':
1133 with open_data(CASE_FOLDING, version) as file:
1134 for s in file:
1135 s = s[:-1].split('#', 1)[0]
1136 if not s:
1137 continue
1138 data = s.split("; ")
1139 if data[1] in "CF":
1140 c = int(data[0], 16)
1141 cf[c] = [int(char, 16) for char in data[2].split()]
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +00001142
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001143 def uselatin1(self):
1144 # restrict character range to ISO Latin 1
Georg Brandlbf82e372008-05-16 17:02:34 +00001145 self.chars = list(range(256))
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001146
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001147# hash table tools
1148
1149# this is a straight-forward reimplementation of Python's built-in
1150# dictionary type, using a static data structure, and a custom string
1151# hash algorithm.
1152
def myhash(s, magic):
    """Case-insensitive string hash folded into 24 bits.

    Used by Hash below to place names into the static name->code table;
    the result always fits in the low 24 bits.
    """
    h = 0
    for ch in s.upper():
        h = h * magic + ord(ch)
        overflow = h & 0xff000000
        if overflow:
            # fold bits 24..31 back into the low 24 bits
            h = (h ^ ((overflow >> 24) & 0xff)) & 0x00ffffff
    return h
1161
# Candidate hash-table configurations as (size, poly) pairs: size is a
# power-of-two slot count, and poly parametrizes the probe-increment
# rehashing used by Hash.__init__ below when a collision occurs.
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
1168
class Hash:
    """Static open-addressing hash table built from (key, value) pairs.

    This is a straight reimplementation of a dictionary as a static
    structure; dump() writes the table plus the magic/size/poly
    constants so the C lookup code can replay the same probe sequence.
    """
    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size: the first power of two from SIZES with
        # more slots than items; poly drives collision rehashing
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynomials")

        print(size, "slots in hash table")

        table = [None] * size

        mask = size-1

        n = 0  # collision counter (statistics only)

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            # slot taken: derive a probe increment from the hash and
            # step through the table until a free slot is found
            incr = (h ^ (h >> 3)) & mask;
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print(n, "collisions")
        self.collisions = n

        # unused slots read as 0 in the generated C table
        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array, followed by the #defines
        # needed to reproduce the probe sequence at lookup time
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
1232
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001233# stuff to deal with arrays of unsigned integers
1234
class Array:
    """A named sequence of unsigned integers that can be written out as
    a static C array declaration."""

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # Emit "static <type> <name>[] = {...};" using the narrowest
        # unsigned C integer type that can hold every element.
        width = getsize(self.data)
        if trace:
            print(self.name+":", width*len(self.data), "bytes", file=sys.stderr)
        if width == 1:
            ctype = "unsigned char"
        elif width == 2:
            ctype = "unsigned short"
        else:
            ctype = "unsigned int"
        file.write("static ")
        file.write(ctype)
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            # wrap the initializer at roughly 78 columns
            line = "    "
            for value in self.data:
                item = str(value) + ", "
                if len(line) + len(item) > 78:
                    file.write(line + "\n")
                    line = "    " + item
                else:
                    line = line + item
            if line.strip():
                file.write(line + "\n")
        file.write("};\n\n")
1266
def getsize(data):
    """Return the smallest C integer size, in bytes (1, 2 or 4), able to
    hold every value in *data*.

    An empty sequence yields 1 (a zero-length "unsigned char" array);
    previously this raised ValueError from max() on an empty sequence,
    even though Array.dump calls getsize() before checking for empty
    data.
    """
    if not data:
        return 1
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4
1276
Tim Peters21013482000-09-25 07:13:41 +00001277def splitbins(t, trace=0):
1278 """t, trace=0 -> (t1, t2, shift). Split a table to save space.
1279
1280 t is a sequence of ints. This function can be useful to save space if
1281 many of the ints are the same. t1 and t2 are lists of ints, and shift
1282 is an int, chosen to minimize the combined size of t1 and t2 (in C
1283 code), and where for each i in range(len(t)),
1284 t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1285 where mask is a bitmask isolating the last "shift" bits.
1286
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001287 If optional arg trace is non-zero (default zero), progress info
1288 is printed to sys.stderr. The higher the value, the more info
1289 you'll get.
Tim Peters21013482000-09-25 07:13:41 +00001290 """
1291
Tim Peters21013482000-09-25 07:13:41 +00001292 if trace:
1293 def dump(t1, t2, shift, bytes):
Collin Winter6afaeb72007-08-03 17:06:41 +00001294 print("%d+%d bins at shift %d; %d bytes" % (
1295 len(t1), len(t2), shift, bytes), file=sys.stderr)
1296 print("Size of original table:", len(t)*getsize(t), \
1297 "bytes", file=sys.stderr)
Tim Peters21013482000-09-25 07:13:41 +00001298 n = len(t)-1 # last valid index
1299 maxshift = 0 # the most we can shift n and still have something left
1300 if n > 0:
1301 while n >> 1:
1302 n >>= 1
1303 maxshift += 1
1304 del n
Christian Heimesa37d4c62007-12-04 23:02:19 +00001305 bytes = sys.maxsize # smallest total size so far
Tim Peters21013482000-09-25 07:13:41 +00001306 t = tuple(t) # so slices can be dict keys
1307 for shift in range(maxshift + 1):
1308 t1 = []
1309 t2 = []
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001310 size = 2**shift
1311 bincache = {}
Tim Peters21013482000-09-25 07:13:41 +00001312 for i in range(0, len(t), size):
1313 bin = t[i:i+size]
1314 index = bincache.get(bin)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001315 if index is None:
Tim Peters21013482000-09-25 07:13:41 +00001316 index = len(t2)
1317 bincache[bin] = index
1318 t2.extend(bin)
1319 t1.append(index >> shift)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001320 # determine memory size
Tim Peters21013482000-09-25 07:13:41 +00001321 b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001322 if trace > 1:
Tim Peters21013482000-09-25 07:13:41 +00001323 dump(t1, t2, shift, b)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001324 if b < bytes:
Tim Peters21013482000-09-25 07:13:41 +00001325 best = t1, t2, shift
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001326 bytes = b
Tim Peters21013482000-09-25 07:13:41 +00001327 t1, t2, shift = best
1328 if trace:
Collin Winter6afaeb72007-08-03 17:06:41 +00001329 print("Best:", end=' ', file=sys.stderr)
Tim Peters21013482000-09-25 07:13:41 +00001330 dump(t1, t2, shift, bytes)
1331 if __debug__:
1332 # exhaustively verify that the decomposition is correct
1333 mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
Guido van Rossum805365e2007-05-07 22:24:25 +00001334 for i in range(len(t)):
Tim Peters21013482000-09-25 07:13:41 +00001335 assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1336 return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001337
if __name__ == "__main__":
    # Regenerate the database headers.  The argument is presumably the
    # trace level forwarded to the dump helpers -- see maketables,
    # defined earlier in this file.
    maketables(1)