blob: 713e175c501aa84c76f4213a49ef7d044761ae72 [file] [log] [blame]
#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
# 2008-06-11 gb   add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
# 2011-10-21 ezio add support for name aliases and named sequences
# 2012-01    benjamin add full case mappings
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#
29
Ezio Melotti931b8aa2011-10-21 21:57:36 +030030import os
31import sys
32import zipfile
33
34from textwrap import dedent
Fredrik Lundhf367cac2000-09-24 23:18:31 +000035
SCRIPT = sys.argv[0]
VERSION = "3.2"

# The Unicode Database
# --------------------
# When changing UCD version please update
#   * Doc/library/stdtypes.rst, and
#   * Doc/library/unicodedata.rst
#   * Doc/reference/lexical_analysis.rst (two occurrences)
UNIDATA_VERSION = "8.0.0"

# Input file name templates; the %s slot is filled with "" for the
# current version or "-<version>" for an older UCD release (see
# old_versions below).
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt"
CASE_FOLDING = "CaseFolding%s.txt"

# Private Use Areas -- in planes 1, 15, 16
PUA_1 = range(0xE000, 0xF900)
PUA_15 = range(0xF0000, 0xFFFFE)
PUA_16 = range(0x100000, 0x10FFFE)

# we use these ranges of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0200

old_versions = ["3.2.0"]

# General-category codes, in the order the C side indexes them.
# NOTE(review): "Cn" appears twice (index 0 is the default slot for
# unassigned code points) -- presumably intentional; confirm against
# Modules/unicodedata_db.h consumers before changing.
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON", "LRI", "RLI", "FSI", "PDI" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# Bit flags stored in the type record for each character.
# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NUMERIC_MASK = 0x800
CASE_IGNORABLE_MASK = 0x1000
CASED_MASK = 0x2000
EXTENDED_CASE_MASK = 0x4000

# CJK unified-ideograph blocks, as (first, last) hex code point strings.
# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
    ('3400', '4DB5'),
    ('4E00', '9FD5'),
    ('20000', '2A6D6'),
    ('2A700', '2B734'),
    ('2B740', '2B81D'),
    ('2B820', '2CEA1'),
]
108
def maketables(trace=0):
    """Regenerate all three Unicode database headers.

    Loads the current UCD (UNIDATA_VERSION), merges in every release
    listed in old_versions so the delta tables can be built, then
    writes the name, data and type header files in turn.
    """
    print("--- Reading", UNICODE_DATA % "", "...")

    version = ""
    unicode = UnicodeData(UNIDATA_VERSION)

    # count the assigned (non-None) entries in the code point table
    print(sum(1 for entry in unicode.table if entry), "characters")

    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-" + version), "...")
        old_unicode = UnicodeData(version, cjk_check=False)
        print(sum(1 for entry in old_unicode.table if entry), "characters")
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000127
128# --------------------------------------------------------------------
129# unicode character properties
130
def makeunicodedata(unicode, trace):
    """Write Modules/unicodedata_db.h.

    Emits: the deduplicated database records (category, combining
    class, bidi class, mirrored flag, east-asian width, NFC quick
    check), the decomposition data with its two-level index, the NFC
    first/last reindex tables plus the composition pair table, and --
    for each entry in unicode.changed -- the delta tables that let the
    C code emulate older UCD versions.
    """

    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            normalizationquickcheck = record[17]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables (records are deduplicated
            # through the cache; index maps code point -> record number)
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                # the C code stores the decomposition length in 5 bits
                if len(decomp) > 19:
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix: an optional compatibility tag such as "<compat>"
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content: first word packs prefix index (low byte) and
                # length (high bits); the rest are the decomposed code points
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs: canonical two-character decompositions
                # that are not excluded and whose first char has combining
                # class 0 are recomposable
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                # NOTE(review): decomp_data is a flat list of ints while
                # decomp is a list, so .index() appears to always raise
                # ValueError (i.e. no sharing ever happens) -- confirm
                # whether deduplication was ever intended here.
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    # renumber the NFC first/last participants contiguously and record
    # the runs of consecutive code points as (start, end) ranges
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    # dense 2-D composition table, flattened row-major:
    # comp_data[first_index * total_last + last_index] = composed char
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
    print("/* a list of unique database records */", file=fp)
    print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    print("/* Reindexing of NFC first characters. */", file=fp)
    print("#define TOTAL_FIRST",total_first, file=fp)
    print("#define TOTAL_LAST",total_last, file=fp)
    print("struct reindex{int start;short count,index;};", file=fp)
    print("static struct reindex nfc_first[] = {", file=fp)
    for start,end in comp_first_ranges:
        print("  { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)
    print("  {0,0,0}", file=fp)
    print("};\n", file=fp)
    print("static struct reindex nfc_last[] = {", file=fp)
    for start,end in comp_last_ranges:
        print("  { %d, %d, %d}," % (start,end-start,comp_last[start]), file=fp)
    print("  {0,0,0}", file=fp)
    print("};\n", file=fp)

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print("/* string literals */", file=fp)
    print("const char *_PyUnicode_CategoryNames[] = {", file=fp)
    for name in CATEGORY_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_BidirectionalNames[] = {", file=fp)
    for name in BIDIRECTIONAL_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_EastAsianWidthNames[] = {", file=fp)
    for name in EASTASIANWIDTH_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("static const char *decomp_prefix[] = {", file=fp)
    for name in decomp_prefix:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print("/* index tables for the database records */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print("/* decomposition data */", file=fp)
    Array("decomp_data", decomp_data).dump(fp, trace)

    print("/* index tables for the decomposition data */", file=fp)
    print("#define DECOMP_SHIFT", shift, file=fp)
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print("/* NFC pairs */", file=fp)
    print("#define COMP_SHIFT", shift, file=fp)
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print("static const change_record change_records_%s[] = {" % cversion, file=fp)
        for record in records:
            print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
        print("};", file=fp)
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        # emit a C accessor that resolves a code point to its change record
        print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tint index;", file=fp)
        print("\tif (n >= 0x110000) index = 0;", file=fp)
        print("\telse {", file=fp)
        print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
        print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1)), file=fp)
        print("\t}", file=fp)
        print("\treturn change_records_%s+index;" % cversion, file=fp)
        print("}\n", file=fp)
        # emit the per-version normalization override switch
        print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tswitch(n) {", file=fp)
        for k, v in normalization:
            print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
        print("\tdefault: return 0;", file=fp)
        print("\t}\n}\n", file=fp)

    fp.close()
379
380# --------------------------------------------------------------------
381# unicode character type tables
382
def makeunicodetype(unicode, trace):
    """Write Objects/unicodetype_db.h.

    Builds the deduplicated character type records (upper/lower/title
    deltas or extended-case offsets, decimal and digit values, flag
    bits), the extended case mapping array, the two-level type index,
    and the generated C helpers _PyUnicode_ToNumeric(),
    _PyUnicode_IsWhitespace() and _PyUnicode_IsLinebreak().
    """

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}            # numeric value (string) -> [code points]
    spaces = []
    linebreaks = []
    extra_casing = []       # flat Py_UCS4 array for multi-char case maps

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            # NOTE(review): `delta` is assigned but never used below.
            delta = True
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if "Lowercase" in properties:
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if "Uppercase" in properties:
                flags |= UPPER_MASK
            # space is the only printable character in categories C/Z
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            if "Cased" in properties:
                flags |= CASED_MASK
            if "Case_Ignorable" in properties:
                flags |= CASE_IGNORABLE_MASK
            sc = unicode.special_casing.get(char)
            cf = unicode.case_folding.get(char, [char])
            # simple one-to-one mappings from UnicodeData.txt fields
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                title = upper
            # a case-folding that differs from the lowercase mapping
            # forces the extended-case representation
            if sc is None and cf != [lower]:
                sc = ([lower], [title], [upper])
            if sc is None:
                if upper == lower == title:
                    upper = lower = title = 0
                else:
                    # store signed deltas relative to the code point
                    upper = upper - char
                    lower = lower - char
                    title = title - char
                    assert (abs(upper) <= 2147483647 and
                            abs(lower) <= 2147483647 and
                            abs(title) <= 2147483647)
            else:
                # This happens either when some character maps to more than one
                # character in uppercase, lowercase, or titlecase or the
                # casefolded version of the character is different from the
                # lowercase. The extra characters are stored in a different
                # array.
                flags |= EXTENDED_CASE_MASK
                # pack offset into extra_casing (low bits) with length (<<24)
                lower = len(extra_casing) | (len(sc[0]) << 24)
                extra_casing.extend(sc[0])
                if cf != sc[0]:
                    # case-fold length goes into bits 20-23
                    lower |= len(cf) << 20
                    extra_casing.extend(cf)
                upper = len(extra_casing) | (len(sc[2]) << 24)
                extra_casing.extend(sc[2])
                # Title is probably equal to upper.
                if sc[1] == sc[2]:
                    title = upper
                else:
                    title = len(extra_casing) | (len(sc[1]) << 24)
                    extra_casing.extend(sc[1])
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables (deduplicated via cache)
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")
    print(len(extra_casing), "extended case array")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("/* a list of unique character type descriptors */", file=fp)
    print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    print("/* extended case mappings */", file=fp)
    print(file=fp)
    print("const Py_UCS4 _PyUnicode_ExtendedCase[] = {", file=fp)
    for c in extra_casing:
        print("    %d," % c, file=fp)
    print("};", file=fp)
    print(file=fp)

    # split decomposition index table
    index1, index2, shift = splitbins(index, trace)

    print("/* type indexes */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # Generate code for _PyUnicode_ToNumeric()
    numeric_items = sorted(numeric.items())
    print('/* Returns the numeric value as double for Unicode characters', file=fp)
    print(' * having this property, -1.0 otherwise.', file=fp)
    print(' */', file=fp)
    print('double _PyUnicode_ToNumeric(Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for value, codepoints in numeric_items:
        # Turn text into float literals
        parts = value.split('/')
        parts = [repr(float(part)) for part in parts]
        value = '/'.join(parts)

        codepoints.sort()
        for codepoint in codepoints:
            print('    case 0x%04X:' % (codepoint,), file=fp)
        print('        return (double) %s;' % (value,), file=fp)
    print('    }', file=fp)
    print('    return -1.0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsWhitespace()
    print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)
    print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsWhitespace(const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)

    for codepoint in sorted(spaces):
        print('    case 0x%04X:' % (codepoint,), file=fp)
    print('        return 1;', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsLinebreak()
    print("/* Returns 1 for Unicode characters having the line break", file=fp)
    print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp)
    print(" * type 'B', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for codepoint in sorted(linebreaks):
        print('    case 0x%04X:' % (codepoint,), file=fp)
    print('        return 1;', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    fp.close()
592
593# --------------------------------------------------------------------
594# unicode name database
595
def makeunicodename(unicode, trace):
    """Generate Modules/unicodename_db.h from *unicode* (a UnicodeData
    instance).

    The header contains: a compressed word lexicon, a per-codepoint
    "phrasebook" of lexicon indexes for code->name lookups, a static
    name->code hash table, and the name-alias / named-sequence tables.
    *trace* is passed through to the table writers for size reporting.
    """

    FILE = "Modules/unicodename_db.h"

    print("--- Preparing", FILE, "...")

    # collect names; entries whose name starts with "<" are range
    # markers or controls, not real character names, and are skipped
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print(len(list(n for n in names if n is not None)), "distinct names")

    # collect unique words from names (note that we distinguish between
    # words inside a sentence and words ending a sentence; the latter
    # include the trailing null byte)

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print(n, "words in text;", b, "bytes")

    wordlist = list(words.items())

    # sort on falling frequency, then by name
    def word_key(a):
        aword, alist = a
        return -len(alist), aword
    wordlist.sort(key=word_key)

    # figure out how many phrasebook escapes we need: words beyond the
    # one-byte range are encoded as two bytes via an escape prefix
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print(escapes, "escapes")

    short = 256 - escapes

    assert short > 0

    print(short, "short indexes in lexicon")

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print(n, "short indexes in phrasebook")

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = list(map(ord, lexicon))

    # generate phrasebook from names and lexicon: one-byte index for
    # the `short` most frequent words, two bytes (escape + low byte)
    # for the rest
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set.  if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("#define NAME_MAXLEN", 256, file=fp)
    print(file=fp)
    print("/* lexicon */", file=fp)
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split decomposition index table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print("/* code->name phrasebook */", file=fp)
    print("#define phrasebook_shift", shift, file=fp)
    print("#define phrasebook_short", short, file=fp)

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print("/* name->code dictionary */", file=fp)
    codehash.dump(fp, trace)

    print(file=fp)
    print('static const unsigned int aliases_start = %#x;' %
          NAME_ALIASES_START, file=fp)
    print('static const unsigned int aliases_end = %#x;' %
          (NAME_ALIASES_START + len(unicode.aliases)), file=fp)

    print('static const unsigned int name_aliases[] = {', file=fp)
    for name, codepoint in unicode.aliases:
        print('    0x%04X,' % codepoint, file=fp)
    print('};', file=fp)

    # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
    # so we are using Py_UCS2 seq[4].  This needs to be updated if longer
    # sequences or sequences with non-BMP chars are added.
    # unicodedata_lookup should be adapted too.
    print(dedent("""
        typedef struct NamedSequence {
            int seqlen;
            Py_UCS2 seq[4];
        } named_sequence;
        """), file=fp)

    print('static const unsigned int named_sequences_start = %#x;' %
          NAMED_SEQUENCES_START, file=fp)
    print('static const unsigned int named_sequences_end = %#x;' %
          (NAMED_SEQUENCES_START + len(unicode.named_sequences)), file=fp)

    print('static const named_sequence named_sequences[] = {', file=fp)
    for name, sequence in unicode.named_sequences:
        seq_str = ', '.join('0x%04X' % cp for cp in sequence)
        print('    {%d, {%s}},' % (len(sequence), seq_str), file=fp)
    print('};', file=fp)

    fp.close()
787
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000788
def merge_old_version(version, new, old):
    """Record the differences between *old* and *new* UnicodeData
    instances as a delta appended to new.changed, keyed by the *version*
    string of the old database.

    The field indexes (k) below follow the record layout documented on
    the UnicodeData class.  Unsupported differences raise an exception.
    """
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 1 and i in PUA_15:
                        # the name is not set in the old.table, but in the
                        # new.table we are using it for aliases and named seq
                        assert value == ''
                    elif k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value is better not 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        # bidi-mirrored flag, stored as '1'/'0' strings
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 16:
                        # derived property changes; not yet
                        pass
                    elif k == 17:
                        # normalization quickchecks are not performed
                        # for older versions
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, list(zip(bidir_changes, category_changes,
                                          decimal_changes, mirrored_changes,
                                          numeric_changes)),
                       normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000879
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000880def open_data(template, version):
881 local = template % ('-'+version,)
882 if not os.path.exists(local):
883 import urllib.request
884 if version == '3.2.0':
885 # irregular url structure
886 url = 'http://www.unicode.org/Public/3.2-Update/' + local
887 else:
888 url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
889 urllib.request.urlretrieve(url, filename=local)
890 if local.endswith('.txt'):
891 return open(local, encoding='utf-8')
892 else:
893 # Unihan.zip
894 return open(local, 'rb')
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000895
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000896# --------------------------------------------------------------------
897# the following support code is taken from the unidb utilities
898# Copyright (c) 1999-2000 by Secret Labs AB
899
900# load a unicode-data file from disk
901
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000902class UnicodeData:
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000903 # Record structure:
904 # [ID, name, category, combining, bidi, decomp, (6)
905 # decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
906 # ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
907 # derived-props] (17)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000908
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000909 def __init__(self, version,
910 linebreakprops=False,
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000911 expand=1,
912 cjk_check=True):
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000913 self.changed = []
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000914 table = [None] * 0x110000
Ezio Melotti2a1e9262011-09-30 08:46:25 +0300915 with open_data(UNICODE_DATA, version) as file:
916 while 1:
917 s = file.readline()
918 if not s:
919 break
920 s = s.strip().split(";")
921 char = int(s[0], 16)
922 table[char] = s
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000923
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000924 cjk_ranges_found = []
925
Martin v. Löwis97225da2002-11-24 23:05:09 +0000926 # expand first-last ranges
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000927 if expand:
928 field = None
Martin v. Löwis97225da2002-11-24 23:05:09 +0000929 for i in range(0, 0x110000):
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000930 s = table[i]
931 if s:
932 if s[1][-6:] == "First>":
933 s[1] = ""
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000934 field = s
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000935 elif s[1][-5:] == "Last>":
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000936 if s[1].startswith("<CJK Ideograph"):
937 cjk_ranges_found.append((field[0],
938 s[0]))
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000939 s[1] = ""
940 field = None
941 elif field:
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000942 f2 = field[:]
943 f2[0] = "%X" % i
944 table[i] = f2
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000945 if cjk_check and cjk_ranges != cjk_ranges_found:
946 raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000947
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000948 # public attributes
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000949 self.filename = UNICODE_DATA % ''
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000950 self.table = table
Georg Brandlbf82e372008-05-16 17:02:34 +0000951 self.chars = list(range(0x110000)) # unicode 3.2
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000952
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300953 # check for name aliases and named sequences, see #12753
954 # aliases and named sequences are not in 3.2.0
955 if version != '3.2.0':
956 self.aliases = []
957 # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
958 # in order to take advantage of the compression and lookup
959 # algorithms used for the other characters
960 pua_index = NAME_ALIASES_START
961 with open_data(NAME_ALIASES, version) as file:
962 for s in file:
963 s = s.strip()
964 if not s or s.startswith('#'):
965 continue
Benjamin Peterson71f660e2012-02-20 22:24:29 -0500966 char, name, abbrev = s.split(';')
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300967 char = int(char, 16)
968 self.aliases.append((name, char))
969 # also store the name in the PUA 1
970 self.table[pua_index][1] = name
971 pua_index += 1
972 assert pua_index - NAME_ALIASES_START == len(self.aliases)
973
974 self.named_sequences = []
Ezio Melotti7c4a7e62013-08-26 01:32:56 +0300975 # store named sequences in the PUA 1, in range U+F0100..,
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300976 # in order to take advantage of the compression and lookup
977 # algorithms used for the other characters.
978
Benjamin Peterson71f660e2012-02-20 22:24:29 -0500979 assert pua_index < NAMED_SEQUENCES_START
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300980 pua_index = NAMED_SEQUENCES_START
981 with open_data(NAMED_SEQUENCES, version) as file:
982 for s in file:
983 s = s.strip()
984 if not s or s.startswith('#'):
985 continue
986 name, chars = s.split(';')
987 chars = tuple(int(char, 16) for char in chars.split())
988 # check that the structure defined in makeunicodename is OK
989 assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
990 assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
991 "the NamedSequence struct and in unicodedata_lookup")
992 self.named_sequences.append((name, chars))
993 # also store these in the PUA 1
994 self.table[pua_index][1] = name
995 pua_index += 1
996 assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)
997
Martin v. Löwis677bde22002-11-23 22:08:15 +0000998 self.exclusions = {}
Ezio Melotti2a1e9262011-09-30 08:46:25 +0300999 with open_data(COMPOSITION_EXCLUSIONS, version) as file:
1000 for s in file:
1001 s = s.strip()
1002 if not s:
1003 continue
1004 if s[0] == '#':
1005 continue
1006 char = int(s.split()[0],16)
1007 self.exclusions[char] = 1
Martin v. Löwis677bde22002-11-23 22:08:15 +00001008
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00001009 widths = [None] * 0x110000
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001010 with open_data(EASTASIAN_WIDTH, version) as file:
1011 for s in file:
1012 s = s.strip()
1013 if not s:
1014 continue
1015 if s[0] == '#':
1016 continue
1017 s = s.split()[0].split(';')
1018 if '..' in s[0]:
1019 first, last = [int(c, 16) for c in s[0].split('..')]
1020 chars = list(range(first, last+1))
1021 else:
1022 chars = [int(s[0], 16)]
1023 for char in chars:
1024 widths[char] = s[1]
1025
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00001026 for i in range(0, 0x110000):
1027 if table[i] is not None:
1028 table[i].append(widths[i])
1029
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001030 for i in range(0, 0x110000):
1031 if table[i] is not None:
1032 table[i].append(set())
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001033
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001034 with open_data(DERIVED_CORE_PROPERTIES, version) as file:
1035 for s in file:
1036 s = s.split('#', 1)[0].strip()
1037 if not s:
1038 continue
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001039
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001040 r, p = s.split(";")
1041 r = r.strip()
1042 p = p.strip()
1043 if ".." in r:
1044 first, last = [int(c, 16) for c in r.split('..')]
1045 chars = list(range(first, last+1))
1046 else:
1047 chars = [int(r, 16)]
1048 for char in chars:
1049 if table[char]:
1050 # Some properties (e.g. Default_Ignorable_Code_Point)
1051 # apply to unassigned code points; ignore them
1052 table[char][-1].add(p)
1053
1054 with open_data(LINE_BREAK, version) as file:
1055 for s in file:
1056 s = s.partition('#')[0]
1057 s = [i.strip() for i in s.split(';')]
1058 if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
1059 continue
1060 if '..' not in s[0]:
1061 first = last = int(s[0], 16)
1062 else:
1063 first, last = [int(c, 16) for c in s[0].split('..')]
1064 for char in range(first, last+1):
1065 table[char][-1].add('Line_Break')
Florent Xicluna806d8cf2010-03-30 19:34:18 +00001066
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001067 # We only want the quickcheck properties
1068 # Format: NF?_QC; Y(es)/N(o)/M(aybe)
1069 # Yes is the default, hence only N and M occur
1070 # In 3.2.0, the format was different (NF?_NO)
1071 # The parsing will incorrectly determine these as
1072 # "yes", however, unicodedata.c will not perform quickchecks
1073 # for older versions, and no delta records will be created.
1074 quickchecks = [0] * 0x110000
1075 qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001076 with open_data(DERIVEDNORMALIZATION_PROPS, version) as file:
1077 for s in file:
1078 if '#' in s:
1079 s = s[:s.index('#')]
1080 s = [i.strip() for i in s.split(';')]
1081 if len(s) < 2 or s[1] not in qc_order:
1082 continue
1083 quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
1084 quickcheck_shift = qc_order.index(s[1])*2
1085 quickcheck <<= quickcheck_shift
1086 if '..' not in s[0]:
1087 first = last = int(s[0], 16)
1088 else:
1089 first, last = [int(c, 16) for c in s[0].split('..')]
1090 for char in range(first, last+1):
1091 assert not (quickchecks[char]>>quickcheck_shift)&3
1092 quickchecks[char] |= quickcheck
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001093 for i in range(0, 0x110000):
1094 if table[i] is not None:
1095 table[i].append(quickchecks[i])
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +00001096
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001097 with open_data(UNIHAN, version) as file:
1098 zip = zipfile.ZipFile(file)
1099 if version == '3.2.0':
1100 data = zip.open('Unihan-3.2.0.txt').read()
1101 else:
1102 data = zip.open('Unihan_NumericValues.txt').read()
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001103 for line in data.decode("utf-8").splitlines():
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +00001104 if not line.startswith('U+'):
1105 continue
1106 code, tag, value = line.split(None, 3)[:3]
1107 if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
1108 'kOtherNumeric'):
1109 continue
1110 value = value.strip().replace(',', '')
1111 i = int(code[2:], 16)
1112 # Patch the numeric field
1113 if table[i] is not None:
1114 table[i][8] = value
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05001115 sc = self.special_casing = {}
1116 with open_data(SPECIAL_CASING, version) as file:
1117 for s in file:
1118 s = s[:-1].split('#', 1)[0]
1119 if not s:
1120 continue
1121 data = s.split("; ")
1122 if data[4]:
1123 # We ignore all conditionals (since they depend on
1124 # languages) except for one, which is hardcoded. See
1125 # handle_capital_sigma in unicodeobject.c.
1126 continue
1127 c = int(data[0], 16)
1128 lower = [int(char, 16) for char in data[1].split()]
1129 title = [int(char, 16) for char in data[2].split()]
1130 upper = [int(char, 16) for char in data[3].split()]
1131 sc[c] = (lower, title, upper)
Benjamin Petersond5890c82012-01-14 13:23:30 -05001132 cf = self.case_folding = {}
1133 if version != '3.2.0':
1134 with open_data(CASE_FOLDING, version) as file:
1135 for s in file:
1136 s = s[:-1].split('#', 1)[0]
1137 if not s:
1138 continue
1139 data = s.split("; ")
1140 if data[1] in "CF":
1141 c = int(data[0], 16)
1142 cf[c] = [int(char, 16) for char in data[2].split()]
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +00001143
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001144 def uselatin1(self):
1145 # restrict character range to ISO Latin 1
Georg Brandlbf82e372008-05-16 17:02:34 +00001146 self.chars = list(range(256))
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001147
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001148# hash table tools
1149
1150# this is a straight-forward reimplementation of Python's built-in
1151# dictionary type, using a static data structure, and a custom string
1152# hash algorithm.
1153
def myhash(s, magic):
    """Case-insensitive 24-bit string hash used for the name lookup table.

    Multiplies by *magic* per character; whenever the accumulator grows
    past 24 bits, the overflowed byte is folded back into the low bits,
    so the result always fits in 24 bits.
    """
    value = 0
    for ch in s.upper():
        value = value * magic + ord(ch)
        overflow = value & 0xff000000
        if overflow:
            # fold the high byte back into the low 24 bits
            value = (value ^ ((overflow >> 24) & 0xff)) & 0x00ffffff
    return value
1162
# candidate hash-table geometries: (power-of-two size, polynomial) pairs;
# Hash.__init__ picks the first size larger than the data set and uses the
# polynomial to perturb the collision probe sequence
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
1169
class Hash:
    """Static open-addressing hash table, dumped as C arrays.

    Reimplements Python's dictionary probing with a fixed table and the
    custom myhash() string hash, so unicodedata.c can do name->code
    lookups without building a dict at runtime.
    """
    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size: first power of two larger than the data
        # set; `poly` perturbs the probe sequence on collisions
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynomials")

        print(size, "slots in hash table")

        table = [None] * size

        mask = size-1

        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            # collision: derive a probe increment from the hash and walk
            # the table until a free slot is found
            incr = (h ^ (h >> 3)) & mask;
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print(n, "collisions")
        self.collisions = n

        # empty slots are encoded as 0 in the emitted C array
        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array, followed by the #defines
        # unicodedata.c needs to reproduce the probe sequence
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
1233
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001234# stuff to deal with arrays of unsigned integers
1235
class Array:
    """A named sequence of unsigned integers, writable as C source."""

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        """Write the array to *file* as a static C array declaration.

        The element type is the smallest unsigned C integer that holds
        every value.  If *trace* is true, the array's byte size is
        reported on stderr.
        """
        size = getsize(self.data)
        if trace:
            print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)
        # choose the narrowest element type that fits
        if size == 1:
            ctype = "unsigned char"
        elif size == 2:
            ctype = "unsigned short"
        else:
            ctype = "unsigned int"
        file.write("static ")
        file.write(ctype)
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            line = "    "
            for item in self.data:
                piece = str(item) + ", "
                # wrap output lines at 78 columns
                if len(line) + len(piece) > 78:
                    file.write(line + "\n")
                    line = "    " + piece
                else:
                    line = line + piece
            if line.strip():
                file.write(line + "\n")
        file.write("};\n\n")
1267
def getsize(data):
    """Return the smallest C integer width in bytes (1, 2 or 4) able to
    hold every value of *data* (a non-empty sequence of ints)."""
    biggest = max(data)
    if biggest < 256:
        return 1
    if biggest < 65536:
        return 2
    return 4
1277
Tim Peters21013482000-09-25 07:13:41 +00001278def splitbins(t, trace=0):
1279 """t, trace=0 -> (t1, t2, shift). Split a table to save space.
1280
1281 t is a sequence of ints. This function can be useful to save space if
1282 many of the ints are the same. t1 and t2 are lists of ints, and shift
1283 is an int, chosen to minimize the combined size of t1 and t2 (in C
1284 code), and where for each i in range(len(t)),
1285 t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1286 where mask is a bitmask isolating the last "shift" bits.
1287
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001288 If optional arg trace is non-zero (default zero), progress info
1289 is printed to sys.stderr. The higher the value, the more info
1290 you'll get.
Tim Peters21013482000-09-25 07:13:41 +00001291 """
1292
Tim Peters21013482000-09-25 07:13:41 +00001293 if trace:
1294 def dump(t1, t2, shift, bytes):
Collin Winter6afaeb72007-08-03 17:06:41 +00001295 print("%d+%d bins at shift %d; %d bytes" % (
1296 len(t1), len(t2), shift, bytes), file=sys.stderr)
1297 print("Size of original table:", len(t)*getsize(t), \
1298 "bytes", file=sys.stderr)
Tim Peters21013482000-09-25 07:13:41 +00001299 n = len(t)-1 # last valid index
1300 maxshift = 0 # the most we can shift n and still have something left
1301 if n > 0:
1302 while n >> 1:
1303 n >>= 1
1304 maxshift += 1
1305 del n
Christian Heimesa37d4c62007-12-04 23:02:19 +00001306 bytes = sys.maxsize # smallest total size so far
Tim Peters21013482000-09-25 07:13:41 +00001307 t = tuple(t) # so slices can be dict keys
1308 for shift in range(maxshift + 1):
1309 t1 = []
1310 t2 = []
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001311 size = 2**shift
1312 bincache = {}
Tim Peters21013482000-09-25 07:13:41 +00001313 for i in range(0, len(t), size):
1314 bin = t[i:i+size]
1315 index = bincache.get(bin)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001316 if index is None:
Tim Peters21013482000-09-25 07:13:41 +00001317 index = len(t2)
1318 bincache[bin] = index
1319 t2.extend(bin)
1320 t1.append(index >> shift)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001321 # determine memory size
Tim Peters21013482000-09-25 07:13:41 +00001322 b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001323 if trace > 1:
Tim Peters21013482000-09-25 07:13:41 +00001324 dump(t1, t2, shift, b)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001325 if b < bytes:
Tim Peters21013482000-09-25 07:13:41 +00001326 best = t1, t2, shift
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001327 bytes = b
Tim Peters21013482000-09-25 07:13:41 +00001328 t1, t2, shift = best
1329 if trace:
Collin Winter6afaeb72007-08-03 17:06:41 +00001330 print("Best:", end=' ', file=sys.stderr)
Tim Peters21013482000-09-25 07:13:41 +00001331 dump(t1, t2, shift, bytes)
1332 if __debug__:
1333 # exhaustively verify that the decomposition is correct
1334 mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
Guido van Rossum805365e2007-05-07 22:24:25 +00001335 for i in range(len(t)):
Tim Peters21013482000-09-25 07:13:41 +00001336 assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1337 return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001338
if __name__ == "__main__":
    # Regenerate all database headers; the argument is presumably the
    # trace flag threaded through to the make* helpers -- TODO confirm
    # against the maketables definition earlier in this file.
    maketables(1)