Blame - Tools/unicode/makeunicodedata.py - platform/external/python/cpython3

2000-09-24 23:18:31 +0000

[diff] [blame]

1

#

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

2

# (re)generate unicode property and type databases

3

#

Martin v. Löwis

b5c980b

2002-11-25 09:13:37 +0000

[diff] [blame]

4

# this script converts a unicode 3.2 database file to

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

5

# Modules/unicodedata_db.h, Modules/unicodename_db.h,

6

# and Objects/unicodetype_db.h

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

7

#

8

# history:

9

# 2000-09-24 fl created (based on bits and pieces from unidb)

10

# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

11

# 2000-09-25 fl added character type table

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

12

# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

13

# 2000-11-03 fl expand first/last ranges

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

14

# 2001-01-19 fl added character name tables (2.1)

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

15

# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold

Martin v. Löwis

2002-11-23 22:08:15 +0000

[diff] [blame]

16

# 2002-09-11 wd use string methods

17

# 2002-10-18 mvl update to Unicode 3.2

18

# 2002-10-22 mvl generate NFC tables

Martin v. Löwis

2002-11-24 23:05:09 +0000

[diff] [blame]

19

# 2002-11-24 mvl expand all ranges, sort names version-independently

Martin v. Löwis

b5c980b

2002-11-25 09:13:37 +0000

[diff] [blame]

20

# 2002-11-25 mvl add UNIDATA_VERSION

Hye-Shik Chang

974ed7c

2004-06-02 16:49:17 +0000

[diff] [blame]

21

# 2004-05-29 perky add east asian width information

Martin v. Löwis

43179c8

2006-03-11 12:43:44 +0000

[diff] [blame]

22

# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta

Georg Brandl

d52429f

2008-07-04 15:55:02 +0000

[diff] [blame]

23

# 2008-06-11 gb add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch

Ezio Melotti

2011-10-21 21:57:36 +0300

[diff] [blame]

24

# 2011-10-21 ezio add support for name aliases and named sequences

Benjamin Peterson

2012-01-11 18:17:06 -0500

[diff] [blame]

25

# 2012-01 benjamin add full case mappings

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

26

#

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

27

# written by Fredrik Lundh (fredrik@pythonware.com)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

28

#

29

Ezio Melotti

2011-10-21 21:57:36 +0300

[diff] [blame]

import os

import sys

import zipfile

from textwrap import dedent

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

35

36

SCRIPT = sys.argv[0]

Amaury Forgeot d'Arc

324ac65

2010-08-18 20:44:58 +0000

[diff] [blame]

37

VERSION = "3.2"

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

38

Martin v. Löwis

b5c980b

2002-11-25 09:13:37 +0000

[diff] [blame]

39

# The Unicode Database

R David Murray

7445a38

2014-10-09 17:30:33 -0400

[diff] [blame]

40

# --------------------

41

# When changing UCD version please update

42

# * Doc/library/stdtypes.rst, and

43

# * Doc/library/unicodedata.rst

R David Murray

5f16f90

2014-10-09 20:45:59 -0400

[diff] [blame]

44

# * Doc/reference/lexical_analysis.rst (two occurrences)

Benjamin Peterson

2017-06-22 22:31:08 -0700

[diff] [blame]

45

UNIDATA_VERSION = "10.0.0"

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

46

UNICODE_DATA = "UnicodeData%s.txt"

47

COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"

48

EASTASIAN_WIDTH = "EastAsianWidth%s.txt"

Martin v. Löwis

2010-10-11 22:42:28 +0000

[diff] [blame]

49

UNIHAN = "Unihan%s.zip"

Martin v. Löwis

2007-08-14 22:37:03 +0000

[diff] [blame]

50

DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"

Antoine Pitrou

2009-04-27 22:31:40 +0000

[diff] [blame]

51

DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"

Florent Xicluna

2010-03-30 19:34:18 +0000

[diff] [blame]

52

LINE_BREAK = "LineBreak%s.txt"

Ezio Melotti

2011-10-21 21:57:36 +0300

[diff] [blame]

53

NAME_ALIASES = "NameAliases%s.txt"

54

NAMED_SEQUENCES = "NamedSequences%s.txt"

Benjamin Peterson

2012-01-11 18:17:06 -0500

[diff] [blame]

55

SPECIAL_CASING = "SpecialCasing%s.txt"

Benjamin Peterson

2012-01-14 13:23:30 -0500

[diff] [blame]

56

CASE_FOLDING = "CaseFolding%s.txt"

Ezio Melotti

2011-10-21 21:57:36 +0300

[diff] [blame]

57

58

# Private Use Areas -- in planes 1, 15, 16

59

PUA_1 = range(0xE000, 0xF900)

60

PUA_15 = range(0xF0000, 0xFFFFE)

61

PUA_16 = range(0x100000, 0x10FFFE)

62

63

# we use this ranges of PUA_15 to store name aliases and named sequences

64

NAME_ALIASES_START = 0xF0000

Benjamin Peterson

71f660e

2012-02-20 22:24:29 -0500

[diff] [blame]

65

NAMED_SEQUENCES_START = 0xF0200

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

66

67

old_versions = ["3.2.0"]

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

68

69

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",

70

"Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",

71

"Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",

72

"So" ]

73

74

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",

75

"PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",

Benjamin Peterson

94d08d9

2013-10-10 17:24:45 -0400

[diff] [blame]

76

"ON", "LRI", "RLI", "FSI", "PDI" ]

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

77

Hye-Shik Chang

2004-08-04 07:38:35 +0000

[diff] [blame]

78

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

79

Florent Xicluna

2010-03-30 19:34:18 +0000

[diff] [blame]

80

MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

81

Fredrik Lundh

2000-09-25 21:01:56 +0000

[diff] [blame]

82

# note: should match definitions in Objects/unicodectype.c

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

ALPHA_MASK = 0x01

DECIMAL_MASK = 0x02

DIGIT_MASK = 0x04

LOWER_MASK = 0x08

Fredrik Lundh

2000-09-25 21:01:56 +0000

[diff] [blame]

87

LINEBREAK_MASK = 0x10

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

88

SPACE_MASK = 0x20

89

TITLE_MASK = 0x40

90

UPPER_MASK = 0x80

Martin v. Löwis

2007-08-14 22:37:03 +0000

[diff] [blame]

91

XID_START_MASK = 0x100

92

XID_CONTINUE_MASK = 0x200

Georg Brandl

d52429f

2008-07-04 15:55:02 +0000

[diff] [blame]

93

PRINTABLE_MASK = 0x400

Benjamin Peterson

2012-01-11 18:17:06 -0500

[diff] [blame]

94

NUMERIC_MASK = 0x800

95

CASE_IGNORABLE_MASK = 0x1000

96

CASED_MASK = 0x2000

97

EXTENDED_CASE_MASK = 0x4000

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

98

Martin v. Löwis

2010-11-22 09:00:02 +0000

[diff] [blame]

99

# these ranges need to match unicodedata.c:is_unified_ideograph

100

cjk_ranges = [

101

('3400', '4DB5'),

Benjamin Peterson

2017-06-22 22:31:08 -0700

[diff] [blame]

102

('4E00', '9FEA'),

Martin v. Löwis

2010-11-22 09:00:02 +0000

[diff] [blame]

103

('20000', '2A6D6'),

104

('2A700', '2B734'),

Benjamin Peterson

4801383

2015-06-27 15:45:56 -0500

[diff] [blame]

105

('2B740', '2B81D'),

106

('2B820', '2CEA1'),

Benjamin Peterson

2017-06-22 22:31:08 -0700

[diff] [blame]

107

('2CEB0', '2EBE0'),

Martin v. Löwis

2010-11-22 09:00:02 +0000

[diff] [blame]

108

]

109

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

110

def maketables(trace=0):

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

111

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

112

print("--- Reading", UNICODE_DATA % "", "...")

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

113

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

114

version = ""

Martin v. Löwis

2010-10-11 22:42:28 +0000

[diff] [blame]

115

unicode = UnicodeData(UNIDATA_VERSION)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

116

Georg Brandl

559e5d7

2008-06-11 18:37:52 +0000

[diff] [blame]

117

print(len(list(filter(None, unicode.table))), "characters")

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

118

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

119

for version in old_versions:

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

120

print("--- Reading", UNICODE_DATA % ("-"+version), "...")

Martin v. Löwis

2010-11-22 09:00:02 +0000

[diff] [blame]

121

old_unicode = UnicodeData(version, cjk_check=False)

Georg Brandl

559e5d7

2008-06-11 18:37:52 +0000

[diff] [blame]

122

print(len(list(filter(None, old_unicode.table))), "characters")

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

123

merge_old_version(version, unicode, old_unicode)

124

Fredrik Lundh

b2dfd73

2001-01-21 23:31:52 +0000

[diff] [blame]

125

makeunicodename(unicode, trace)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

126

makeunicodedata(unicode, trace)

Fredrik Lundh

b2dfd73

2001-01-21 23:31:52 +0000

[diff] [blame]

127

makeunicodetype(unicode, trace)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

128

129

# --------------------------------------------------------------------

130

# unicode character properties

131

132

def makeunicodedata(unicode, trace):

133

Antoine Pitrou

2009-04-27 22:31:40 +0000

[diff] [blame]

134

dummy = (0, 0, 0, 0, 0, 0)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

135

table = [dummy]

136

cache = {0: dummy}

137

index = [0] * len(unicode.chars)

138

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

139

FILE = "Modules/unicodedata_db.h"

140

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

141

print("--- Preparing", FILE, "...")

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

142

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

143

# 1) database properties

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

144

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

145

for char in unicode.chars:

146

record = unicode.table[char]

147

if record:

148

# extract database properties

149

category = CATEGORY_NAMES.index(record[2])

150

combining = int(record[3])

151

bidirectional = BIDIRECTIONAL_NAMES.index(record[4])

152

mirrored = record[9] == "Y"

Hye-Shik Chang

2004-08-04 07:38:35 +0000

[diff] [blame]

153

eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])

Antoine Pitrou

2009-04-27 22:31:40 +0000

[diff] [blame]

154

normalizationquickcheck = record[17]

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

155

item = (

Antoine Pitrou

2009-04-27 22:31:40 +0000

[diff] [blame]

156

category, combining, bidirectional, mirrored, eastasianwidth,

157

normalizationquickcheck

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

158

)

159

# add entry to index and item tables

160

i = cache.get(item)

161

if i is None:

162

cache[item] = i = len(table)

table.append(item)

index[char] = i

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

166

# 2) decomposition data

167

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

168

decomp_data = [0]

169

decomp_prefix = [""]

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

170

decomp_index = [0] * len(unicode.chars)

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

171

decomp_size = 0

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

172

Martin v. Löwis

2002-11-23 22:08:15 +0000

[diff] [blame]

173

comp_pairs = []

174

comp_first = [None] * len(unicode.chars)

175

comp_last = [None] * len(unicode.chars)

176

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

177

for char in unicode.chars:

178

record = unicode.table[char]

179

if record:

180

if record[5]:

Walter Dörwald

2002-09-11 20:36:02 +0000

[diff] [blame]

181

decomp = record[5].split()

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

182

if len(decomp) > 19:

Collin Winter

a817e58

2007-08-22 23:05:06 +0000

[diff] [blame]

183

raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

184

# prefix

185

if decomp[0][0] == "<":

186

prefix = decomp.pop(0)

187

else:

188

prefix = ""

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

189

try:

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

190

i = decomp_prefix.index(prefix)

191

except ValueError:

192

i = len(decomp_prefix)

193

decomp_prefix.append(prefix)

194

prefix = i

195

assert prefix < 256

196

# content

Georg Brandl

bf82e37

2008-05-16 17:02:34 +0000

[diff] [blame]

197

decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]

Martin v. Löwis

2002-11-23 22:08:15 +0000

[diff] [blame]

198

# Collect NFC pairs

199

if not prefix and len(decomp) == 3 and \

200

char not in unicode.exclusions and \

201

unicode.table[decomp[1]][3] == "0":

p, l, r = decomp

comp_first[l] = 1

comp_last[r] = 1

comp_pairs.append((l,r,char))

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

206

try:

207

i = decomp_data.index(decomp)

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

208

except ValueError:

209

i = len(decomp_data)

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

210

decomp_data.extend(decomp)

211

decomp_size = decomp_size + len(decomp) * 2

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

212

else:

213

i = 0

214

decomp_index[char] = i

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

215

Martin v. Löwis

2002-11-23 22:08:15 +0000

[diff] [blame]

216

f = l = 0

217

comp_first_ranges = []

218

comp_last_ranges = []

219

prev_f = prev_l = None

220

for i in unicode.chars:

221

if comp_first[i] is not None:

comp_first[i] = f

f += 1

if prev_f is None:

prev_f = (i,i)

elif prev_f[1]+1 == i:

227

prev_f = prev_f[0],i

228

else:

229

comp_first_ranges.append(prev_f)

230

prev_f = (i,i)

231

if comp_last[i] is not None:

comp_last[i] = l

l += 1

if prev_l is None:

prev_l = (i,i)

elif prev_l[1]+1 == i:

237

prev_l = prev_l[0],i

238

else:

239

comp_last_ranges.append(prev_l)

240

prev_l = (i,i)

241

comp_first_ranges.append(prev_f)

242

comp_last_ranges.append(prev_l)

total_first = f

total_last = l

comp_data = [0]*(total_first*total_last)

247

for f,l,char in comp_pairs:

248

f = comp_first[f]

249

l = comp_last[l]

250

comp_data[f*total_last+l] = char

251

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

252

print(len(table), "unique properties")

253

print(len(decomp_prefix), "unique decomposition prefixes")

254

print(len(decomp_data), "unique decomposition entries:", end=' ')

255

print(decomp_size, "bytes")

256

print(total_first, "first characters in NFC")

257

print(total_last, "last characters in NFC")

258

print(len(comp_pairs), "NFC pairs")

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

259

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

260

print("--- Writing", FILE, "...")

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

261

Fred Drake

9c68505

2000-10-26 03:56:46 +0000

[diff] [blame]

262

fp = open(FILE, "w")

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

263

print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)

264

print(file=fp)

265

print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)

266

print("/* a list of unique database records */", file=fp)

267

print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

268

for item in table:

Antoine Pitrou

2009-04-27 22:31:40 +0000

[diff] [blame]

269

print(" {%d, %d, %d, %d, %d, %d}," % item, file=fp)

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

270

print("};", file=fp)

271

print(file=fp)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

272

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

273

print("/* Reindexing of NFC first characters. */", file=fp)

274

print("#define TOTAL_FIRST",total_first, file=fp)

275

print("#define TOTAL_LAST",total_last, file=fp)

276

print("struct reindex{int start;short count,index;};", file=fp)

Martin v. Löwis

59683e8

2008-06-13 07:50:45 +0000

[diff] [blame]

277

print("static struct reindex nfc_first[] = {", file=fp)

Martin v. Löwis

2002-11-23 22:08:15 +0000

[diff] [blame]

278

for start,end in comp_first_ranges:

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

279

print(" { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)

280

print(" {0,0,0}", file=fp)

281

print("};\n", file=fp)

Martin v. Löwis

59683e8

2008-06-13 07:50:45 +0000

[diff] [blame]

282

print("static struct reindex nfc_last[] = {", file=fp)

Martin v. Löwis

2002-11-23 22:08:15 +0000

[diff] [blame]

283

for start,end in comp_last_ranges:

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

284

print(" { %d, %d, %d}," % (start,end-start,comp_last[start]), file=fp)

285

print(" {0,0,0}", file=fp)

286

print("};\n", file=fp)

Martin v. Löwis

2002-11-23 22:08:15 +0000

[diff] [blame]

287

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

288

# FIXME: <fl> the following tables could be made static, and

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

289

# the support code moved into unicodedatabase.c

290

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

291

print("/* string literals */", file=fp)

292

print("const char *_PyUnicode_CategoryNames[] = {", file=fp)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

293

for name in CATEGORY_NAMES:

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

294

print(" \"%s\"," % name, file=fp)

295

print(" NULL", file=fp)

296

print("};", file=fp)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

297

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

298

print("const char *_PyUnicode_BidirectionalNames[] = {", file=fp)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

299

for name in BIDIRECTIONAL_NAMES:

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

300

print(" \"%s\"," % name, file=fp)

301

print(" NULL", file=fp)

302

print("};", file=fp)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

303

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

304

print("const char *_PyUnicode_EastAsianWidthNames[] = {", file=fp)

Hye-Shik Chang

2004-08-04 07:38:35 +0000

[diff] [blame]

305

for name in EASTASIANWIDTH_NAMES:

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

306

print(" \"%s\"," % name, file=fp)

307

print(" NULL", file=fp)

308

print("};", file=fp)

Hye-Shik Chang

2004-08-04 07:38:35 +0000

[diff] [blame]

309

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

310

print("static const char *decomp_prefix[] = {", file=fp)

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

311

for name in decomp_prefix:

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

312

print(" \"%s\"," % name, file=fp)

313

print(" NULL", file=fp)

314

print("};", file=fp)

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

315

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

316

# split record index table

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

317

index1, index2, shift = splitbins(index, trace)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

318

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

319

print("/* index tables for the database records */", file=fp)

320

print("#define SHIFT", shift, file=fp)

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

321

Array("index1", index1).dump(fp, trace)

322

Array("index2", index2).dump(fp, trace)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

323

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

324

# split decomposition index table

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

325

index1, index2, shift = splitbins(decomp_index, trace)

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

326

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

327

print("/* decomposition data */", file=fp)

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

328

Array("decomp_data", decomp_data).dump(fp, trace)

329

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

330

print("/* index tables for the decomposition data */", file=fp)

331

print("#define DECOMP_SHIFT", shift, file=fp)

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

332

Array("decomp_index1", index1).dump(fp, trace)

333

Array("decomp_index2", index2).dump(fp, trace)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

334

Martin v. Löwis

2002-11-23 22:08:15 +0000

[diff] [blame]

335

index, index2, shift = splitbins(comp_data, trace)

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

336

print("/* NFC pairs */", file=fp)

337

print("#define COMP_SHIFT", shift, file=fp)

Martin v. Löwis

2002-11-23 22:08:15 +0000

[diff] [blame]

338

Array("comp_index", index).dump(fp, trace)

339

Array("comp_data", index2).dump(fp, trace)

340

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

341

# Generate delta tables for old versions

342

for version, table, normalization in unicode.changed:

343

cversion = version.replace(".","_")

344

records = [table[0]]

345

cache = {table[0]:0}

346

index = [0] * len(table)

347

for i, record in enumerate(table):

348

try:

349

index[i] = cache[record]

350

except KeyError:

351

index[i] = cache[record] = len(records)

352

records.append(record)

353

index1, index2, shift = splitbins(index, trace)

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

354

print("static const change_record change_records_%s[] = {" % cversion, file=fp)

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

355

for record in records:

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

356

print("\t{ %s }," % ", ".join(map(str,record)), file=fp)

357

print("};", file=fp)

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

358

Array("changes_%s_index" % cversion, index1).dump(fp, trace)

359

Array("changes_%s_data" % cversion, index2).dump(fp, trace)

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

360

print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)

361

print("{", file=fp)

362

print("\tint index;", file=fp)

363

print("\tif (n >= 0x110000) index = 0;", file=fp)

364

print("\telse {", file=fp)

365

print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)

366

print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \

367

(cversion, shift, ((1<<shift)-1)), file=fp)

368

print("\t}", file=fp)

369

print("\treturn change_records_%s+index;" % cversion, file=fp)

370

print("}\n", file=fp)

371

print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)

372

print("{", file=fp)

373

print("\tswitch(n) {", file=fp)

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

374

for k, v in normalization:

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

375

print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)

376

print("\tdefault: return 0;", file=fp)

377

print("\t}\n}\n", file=fp)

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

378

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

379

fp.close()

380

381

# --------------------------------------------------------------------

382

# unicode character type tables

383

384

def makeunicodetype(unicode, trace):

385

386

FILE = "Objects/unicodetype_db.h"

387

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

388

print("--- Preparing", FILE, "...")

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

389

390

# extract unicode types

Fredrik Lundh

2000-09-25 21:01:56 +0000

[diff] [blame]

391

dummy = (0, 0, 0, 0, 0, 0)

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

392

table = [dummy]

393

cache = {0: dummy}

394

index = [0] * len(unicode.chars)

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

395

numeric = {}

396

spaces = []

397

linebreaks = []

Benjamin Peterson

2012-01-11 18:17:06 -0500

[diff] [blame]

398

extra_casing = []

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

399

400

for char in unicode.chars:

401

record = unicode.table[char]

402

if record:

403

# extract database properties

404

category = record[2]

405

bidirectional = record[4]

Martin v. Löwis

2007-08-14 22:37:03 +0000

[diff] [blame]

406

properties = record[16]

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

407

flags = 0

Martin v. Löwis

93cbca3

2008-09-10 14:08:48 +0000

[diff] [blame]

408

delta = True

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

409

if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:

410

flags |= ALPHA_MASK

Benjamin Peterson

2012-01-11 18:17:06 -0500

[diff] [blame]

411

if "Lowercase" in properties:

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

412

flags |= LOWER_MASK

Florent Xicluna

2010-03-30 19:34:18 +0000

[diff] [blame]

413

if 'Line_Break' in properties or bidirectional == "B":

Fredrik Lundh

2000-09-25 21:01:56 +0000

[diff] [blame]

414

flags |= LINEBREAK_MASK

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

415

linebreaks.append(char)

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

416

if category == "Zs" or bidirectional in ("WS", "B", "S"):

417

flags |= SPACE_MASK

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

418

spaces.append(char)

Fredrik Lundh

375732c

2000-09-25 23:03:34 +0000

[diff] [blame]

419

if category == "Lt":

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

420

flags |= TITLE_MASK

Benjamin Peterson

2012-01-11 18:17:06 -0500

[diff] [blame]

421

if "Uppercase" in properties:

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

422

flags |= UPPER_MASK

Benjamin Peterson

0983274

2009-03-26 17:15:46 +0000

[diff] [blame]

423

if char == ord(" ") or category[0] not in ("C", "Z"):

Georg Brandl

d52429f

2008-07-04 15:55:02 +0000

[diff] [blame]

424

flags |= PRINTABLE_MASK

Martin v. Löwis

2007-08-14 22:37:03 +0000

[diff] [blame]

425

if "XID_Start" in properties:

426

flags |= XID_START_MASK

427

if "XID_Continue" in properties:

428

flags |= XID_CONTINUE_MASK

Benjamin Peterson

2012-01-11 18:17:06 -0500

[diff] [blame]

429

if "Cased" in properties:

430

flags |= CASED_MASK

431

if "Case_Ignorable" in properties:

432

flags |= CASE_IGNORABLE_MASK

433

sc = unicode.special_casing.get(char)

Benjamin Peterson

2012-01-14 13:23:30 -0500

[diff] [blame]

434

cf = unicode.case_folding.get(char, [char])

435

if record[12]:

436

upper = int(record[12], 16)

else:

upper = char

if record[13]:

lower = int(record[13], 16)

else:

lower = char

if record[14]:

title = int(record[14], 16)

445

else:

446

title = upper

447

if sc is None and cf != [lower]:

448

sc = ([lower], [title], [upper])

Benjamin Peterson

2012-01-11 18:17:06 -0500

[diff] [blame]

449

if sc is None:

Benjamin Peterson

2012-01-11 18:17:06 -0500

[diff] [blame]

450

if upper == lower == title:

451

upper = lower = title = 0

Benjamin Peterson

ad9c569

2012-01-15 21:19:20 -0500

[diff] [blame]

else:

upper = upper - char

lower = lower - char

title = title - char

assert (abs(upper) <= 2147483647 and

457

abs(lower) <= 2147483647 and

458

abs(title) <= 2147483647)

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

459

else:

Benjamin Peterson

2012-01-14 13:23:30 -0500

[diff] [blame]

460

# This happens either when some character maps to more than one

461

# character in uppercase, lowercase, or titlecase or the

462

# casefolded version of the character is different from the

463

# lowercase. The extra characters are stored in a different

464

# array.

Benjamin Peterson

2012-01-11 18:17:06 -0500

[diff] [blame]

465

flags |= EXTENDED_CASE_MASK

466

lower = len(extra_casing) | (len(sc[0]) << 24)

467

extra_casing.extend(sc[0])

Benjamin Peterson

2012-01-14 13:23:30 -0500

[diff] [blame]

468

if cf != sc[0]:

469

lower |= len(cf) << 20

470

extra_casing.extend(cf)

Benjamin Peterson

2012-01-11 18:17:06 -0500

[diff] [blame]

471

upper = len(extra_casing) | (len(sc[2]) << 24)

472

extra_casing.extend(sc[2])

473

# Title is probably equal to upper.

if sc[1] == sc[2]:

title = upper

else:

title = len(extra_casing) | (len(sc[1]) << 24)

478

extra_casing.extend(sc[1])

Fredrik Lundh

2000-09-25 21:01:56 +0000

[diff] [blame]

479

# decimal digit, integer digit

480

decimal = 0

481

if record[6]:

482

flags |= DECIMAL_MASK

483

decimal = int(record[6])

digit = 0

if record[7]:

flags |= DIGIT_MASK

digit = int(record[7])

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

488

if record[8]:

489

flags |= NUMERIC_MASK

490

numeric.setdefault(record[8], []).append(char)

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

491

item = (

Hye-Shik Chang

974ed7c

2004-06-02 16:49:17 +0000

[diff] [blame]

492

upper, lower, title, decimal, digit, flags

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

493

)

494

# add entry to index and item tables

495

i = cache.get(item)

496

if i is None:

497

cache[item] = i = len(table)

table.append(item)

index[char] = i

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

501

print(len(table), "unique character type entries")

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

502

print(sum(map(len, numeric.values())), "numeric code points")

503

print(len(spaces), "whitespace code points")

504

print(len(linebreaks), "linebreak code points")

Benjamin Peterson

2012-01-11 18:17:06 -0500

[diff] [blame]

505

print(len(extra_casing), "extended case array")

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

506

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

507

print("--- Writing", FILE, "...")

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

508

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

509

fp = open(FILE, "w")

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

510

print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)

511

print(file=fp)

512

print("/* a list of unique character type descriptors */", file=fp)

513

print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

514

for item in table:

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

515

print(" {%d, %d, %d, %d, %d, %d}," % item, file=fp)

516

print("};", file=fp)

517

print(file=fp)

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

518

Benjamin Peterson

2012-01-11 18:17:06 -0500

[diff] [blame]

519

print("/* extended case mappings */", file=fp)

520

print(file=fp)

521

print("const Py_UCS4 _PyUnicode_ExtendedCase[] = {", file=fp)

522

for c in extra_casing:

523

print(" %d," % c, file=fp)

print("};", file=fp)

print(file=fp)

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

527

# split decomposition index table

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

528

index1, index2, shift = splitbins(index, trace)

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

529

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

530

print("/* type indexes */", file=fp)

531

print("#define SHIFT", shift, file=fp)

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

532

Array("index1", index1).dump(fp, trace)

533

Array("index2", index2).dump(fp, trace)

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

534

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

535

# Generate code for _PyUnicode_ToNumeric()

536

numeric_items = sorted(numeric.items())

537

print('/* Returns the numeric value as double for Unicode characters', file=fp)

538

print(' * having this property, -1.0 otherwise.', file=fp)

539

print(' */', file=fp)

Amaury Forgeot d'Arc

324ac65

2010-08-18 20:44:58 +0000

[diff] [blame]

540

print('double _PyUnicode_ToNumeric(Py_UCS4 ch)', file=fp)

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

541

print('{', file=fp)

542

print(' switch (ch) {', file=fp)

543

for value, codepoints in numeric_items:

Amaury Forgeot d'Arc

919765a

2009-10-13 23:18:53 +0000

[diff] [blame]

544

# Turn text into float literals

545

parts = value.split('/')

546

parts = [repr(float(part)) for part in parts]

547

value = '/'.join(parts)

548

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

549

codepoints.sort()

550

for codepoint in codepoints:

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

551

print(' case 0x%04X:' % (codepoint,), file=fp)

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

552

print(' return (double) %s;' % (value,), file=fp)

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

553

print(' }', file=fp)

554

print(' return -1.0;', file=fp)

print('}', file=fp)

print(file=fp)

# Generate code for _PyUnicode_IsWhitespace()

559

print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)

560

print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)

561

print(" */", file=fp)

Antoine Pitrou

9ed5f27

2013-08-13 20:18:52 +0200

[diff] [blame]

562

print('int _PyUnicode_IsWhitespace(const Py_UCS4 ch)', file=fp)

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

563

print('{', file=fp)

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

564

print(' switch (ch) {', file=fp)

565

Florent Xicluna

f089fd6

2010-03-19 14:25:03 +0000

[diff] [blame]

566

for codepoint in sorted(spaces):

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

567

print(' case 0x%04X:' % (codepoint,), file=fp)

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

568

print(' return 1;', file=fp)

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

569

570

print(' }', file=fp)

571

print(' return 0;', file=fp)

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

print('}', file=fp)

print(file=fp)

# Generate code for _PyUnicode_IsLinebreak()

Florent Xicluna

2010-03-30 19:34:18 +0000

[diff] [blame]

576

print("/* Returns 1 for Unicode characters having the line break", file=fp)

577

print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp)

578

print(" * type 'B', 0 otherwise.", file=fp)

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

579

print(" */", file=fp)

Antoine Pitrou

9ed5f27

2013-08-13 20:18:52 +0200

[diff] [blame]

580

print('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)', file=fp)

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

581

print('{', file=fp)

582

print(' switch (ch) {', file=fp)

Florent Xicluna

f089fd6

2010-03-19 14:25:03 +0000

[diff] [blame]

583

for codepoint in sorted(linebreaks):

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

584

print(' case 0x%04X:' % (codepoint,), file=fp)

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

585

print(' return 1;', file=fp)

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

586

587

print(' }', file=fp)

588

print(' return 0;', file=fp)

print('}', file=fp)

print(file=fp)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

592

fp.close()

593

594

# --------------------------------------------------------------------

595

# unicode name database

596

597

def makeunicodename(unicode, trace):

598

599

FILE = "Modules/unicodename_db.h"

600

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

601

print("--- Preparing", FILE, "...")

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

602

603

# collect names

604

names = [None] * len(unicode.chars)

605

606

for char in unicode.chars:

607

record = unicode.table[char]

608

if record:

609

name = record[1].strip()

610

if name and name[0] != "<":

611

names[char] = name + chr(0)

612

Jon Dufresne

3972628

2017-05-18 07:35:54 -0700

[diff] [blame]

613

print(len([n for n in names if n is not None]), "distinct names")

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

614

615

# collect unique words from names (note that we differ between

616

# words inside a sentence, and words ending a sentence. the

617

# latter includes the trailing null byte.

words = {}

n = b = 0

for char in unicode.chars:

name = names[char]

if name:

w = name.split()

b = b + len(name)

n = n + len(w)

for w in w:

l = words.get(w)

if l:

l.append(None)

else:

words[w] = [len(words)]

633

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

634

print(n, "words in text;", b, "bytes")

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

635

Martin v. Löwis

2007-08-14 22:37:03 +0000

[diff] [blame]

636

wordlist = list(words.items())

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

637

Martin v. Löwis

2002-11-24 23:05:09 +0000

[diff] [blame]

638

# sort on falling frequency, then by name

Mark Dickinson

a56c467

2009-01-27 18:17:45 +0000

[diff] [blame]

639

def word_key(a):

Martin v. Löwis

2007-08-14 22:37:03 +0000

[diff] [blame]

640

aword, alist = a

Mark Dickinson

a56c467

2009-01-27 18:17:45 +0000

[diff] [blame]

641

return -len(alist), aword

642

wordlist.sort(key=word_key)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

643

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

644

# figure out how many phrasebook escapes we need

645

escapes = 0

646

while escapes * 256 < len(wordlist):

647

escapes = escapes + 1

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

648

print(escapes, "escapes")

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

649

650

short = 256 - escapes

assert short > 0

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

654

print(short, "short indexes in lexicon")

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

655

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

656

# statistics

657

n = 0

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

658

for i in range(short):

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

659

n = n + len(wordlist[i][1])

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

660

print(n, "short indexes in phrasebook")

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

661

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

662

# pick the most commonly used words, and sort the rest on falling

663

# length (to maximize overlap)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

664

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

665

wordlist, wordtail = wordlist[:short], wordlist[short:]

Raymond Hettinger

d4cb56d

2008-01-30 02:55:10 +0000

[diff] [blame]

666

wordtail.sort(key=lambda a: a[0], reverse=True)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

667

wordlist.extend(wordtail)

668

669

# generate lexicon from words

lexicon_offset = [0]

lexicon = ""

words = {}

# build a lexicon string

676

offset = 0

677

for w, x in wordlist:

678

# encoding: bit 7 indicates last character in word (chr(128)

679

# indicates the last character in an entire string)

680

ww = w[:-1] + chr(ord(w[-1])+128)

681

# reuse string tails, when possible

Walter Dörwald

2002-09-11 20:36:02 +0000

[diff] [blame]

682

o = lexicon.find(ww)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

683

if o < 0:

684

o = offset

685

lexicon = lexicon + ww

686

offset = offset + len(w)

687

words[w] = len(lexicon_offset)

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

688

lexicon_offset.append(o)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

689

Martin v. Löwis

2007-08-14 22:37:03 +0000

[diff] [blame]

690

lexicon = list(map(ord, lexicon))

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

691

692

# generate phrasebook from names and lexicon

693

phrasebook = [0]

694

phrasebook_offset = [0] * len(unicode.chars)

695

for char in unicode.chars:

name = names[char]

if name:

w = name.split()

phrasebook_offset[char] = len(phrasebook)

700

for w in w:

701

i = words[w]

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

702

if i < short:

703

phrasebook.append(i)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

704

else:

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

705

# store as two bytes

706

phrasebook.append((i>>8) + short)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

707

phrasebook.append(i&255)

708

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

709

assert getsize(phrasebook) == 1

710

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

711

#

712

# unicode name hash table

# extract names

data = []

for char in unicode.chars:

717

record = unicode.table[char]

718

if record:

719

name = record[1].strip()

720

if name and name[0] != "<":

721

data.append((name, char))

722

723

# the magic number 47 was chosen to minimize the number of

724

# collisions on the current data set. if you like, change it

725

# and see what happens...

726

727

codehash = Hash("code", data, 47)

728

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

729

print("--- Writing", FILE, "...")

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

730

731

fp = open(FILE, "w")

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

732

print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)

733

print(file=fp)

734

print("#define NAME_MAXLEN", 256, file=fp)

735

print(file=fp)

736

print("/* lexicon */", file=fp)

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

737

Array("lexicon", lexicon).dump(fp, trace)

738

Array("lexicon_offset", lexicon_offset).dump(fp, trace)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

739

740

# split decomposition index table

741

offset1, offset2, shift = splitbins(phrasebook_offset, trace)

742

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

743

print("/* code->name phrasebook */", file=fp)

744

print("#define phrasebook_shift", shift, file=fp)

745

print("#define phrasebook_short", short, file=fp)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

746

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

747

Array("phrasebook", phrasebook).dump(fp, trace)

748

Array("phrasebook_offset1", offset1).dump(fp, trace)

749

Array("phrasebook_offset2", offset2).dump(fp, trace)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

750

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

751

print("/* name->code dictionary */", file=fp)

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

752

codehash.dump(fp, trace)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

753

Ezio Melotti

2011-10-21 21:57:36 +0300

[diff] [blame]

754

print(file=fp)

755

print('static const unsigned int aliases_start = %#x;' %

756

NAME_ALIASES_START, file=fp)

757

print('static const unsigned int aliases_end = %#x;' %

758

(NAME_ALIASES_START + len(unicode.aliases)), file=fp)

759

760

print('static const unsigned int name_aliases[] = {', file=fp)

761

for name, codepoint in unicode.aliases:

762

print(' 0x%04X,' % codepoint, file=fp)

763

print('};', file=fp)

764

765

# In Unicode 6.0.0, the sequences contain at most 4 BMP chars,

766

# so we are using Py_UCS2 seq[4]. This needs to be updated if longer

767

# sequences or sequences with non-BMP chars are added.

768

# unicodedata_lookup should be adapted too.

769

print(dedent("""

770

typedef struct NamedSequence {

int seqlen;

Py_UCS2 seq[4];

} named_sequence;

"""), file=fp)

print('static const unsigned int named_sequences_start = %#x;' %

777

NAMED_SEQUENCES_START, file=fp)

778

print('static const unsigned int named_sequences_end = %#x;' %

779

(NAMED_SEQUENCES_START + len(unicode.named_sequences)), file=fp)

780

781

print('static const named_sequence named_sequences[] = {', file=fp)

782

for name, sequence in unicode.named_sequences:

783

seq_str = ', '.join('0x%04X' % cp for cp in sequence)

784

print(' {%d, {%s}},' % (len(sequence), seq_str), file=fp)

785

print('};', file=fp)

786

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

787

fp.close()

788

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

789

790

def merge_old_version(version, new, old):

791

# Changes to exclusion file not implemented yet

792

if old.exclusions != new.exclusions:

Collin Winter

a817e58

2007-08-22 23:05:06 +0000

[diff] [blame]

793

raise NotImplementedError("exclusions differ")

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

794

795

# In these change records, 0xFF means "no change"

796

bidir_changes = [0xFF]*0x110000

797

category_changes = [0xFF]*0x110000

798

decimal_changes = [0xFF]*0x110000

Martin v. Löwis

93cbca3

2008-09-10 14:08:48 +0000

[diff] [blame]

799

mirrored_changes = [0xFF]*0x110000

Benjamin Peterson

6775231

2016-09-14 23:53:47 -0700

[diff] [blame]

800

east_asian_width_changes = [0xFF]*0x110000

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

801

# In numeric data, 0 means "no change",

802

# -1 means "did not have a numeric value

803

numeric_changes = [0] * 0x110000

804

# normalization_changes is a list of key-value pairs

805

normalization_changes = []

806

for i in range(0x110000):

807

if new.table[i] is None:

808

# Characters unassigned in the new version ought to

809

# be unassigned in the old one

810

assert old.table[i] is None

811

continue

812

# check characters unassigned in the old version

813

if old.table[i] is None:

814

# category 0 is "unassigned"

815

category_changes[i] = 0

816

continue

817

# check characters that differ

818

if old.table[i] != new.table[i]:

819

for k in range(len(old.table[i])):

820

if old.table[i][k] != new.table[i][k]:

821

value = old.table[i][k]

Ezio Melotti

2011-10-21 21:57:36 +0300

[diff] [blame]

822

if k == 1 and i in PUA_15:

823

# the name is not set in the old.table, but in the

824

# new.table we are using it for aliases and named seq

825

assert value == ''

826

elif k == 2:

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

827

#print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]

828

category_changes[i] = CATEGORY_NAMES.index(value)

829

elif k == 4:

830

#print "BIDIR",hex(i), old.table[i][k], new.table[i][k]

831

bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)

832

elif k == 5:

833

#print "DECOMP",hex(i), old.table[i][k], new.table[i][k]

834

# We assume that all normalization changes are in 1:1 mappings

835

assert " " not in value

836

normalization_changes.append((i, value))

837

elif k == 6:

838

#print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]

839

# we only support changes where the old value is a single digit

840

assert value in "0123456789"

841

decimal_changes[i] = int(value)

842

elif k == 8:

843

# print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]

844

# Since 0 encodes "no change", the old value is better not 0

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

845

if not value:

846

numeric_changes[i] = -1

847

else:

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

848

numeric_changes[i] = float(value)

849

assert numeric_changes[i] not in (0, -1)

Martin v. Löwis

93cbca3

2008-09-10 14:08:48 +0000

[diff] [blame]

850

elif k == 9:

851

if value == 'Y':

852

mirrored_changes[i] = '1'

853

else:

854

mirrored_changes[i] = '0'

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

855

elif k == 11:

856

# change to ISO comment, ignore

857

pass

858

elif k == 12:

859

# change to simple uppercase mapping; ignore

860

pass

861

elif k == 13:

862

# change to simple lowercase mapping; ignore

863

pass

864

elif k == 14:

865

# change to simple titlecase mapping; ignore

866

pass

Benjamin Peterson

6775231

2016-09-14 23:53:47 -0700

[diff] [blame]

867

elif k == 15:

868

# change to east asian width

869

east_asian_width_changes[i] = EASTASIANWIDTH_NAMES.index(value)

Martin v. Löwis

2007-08-14 22:37:03 +0000

[diff] [blame]

870

elif k == 16:

871

# derived property changes; not yet

872

pass

Martin v. Löwis

2010-10-11 22:42:28 +0000

[diff] [blame]

873

elif k == 17:

874

# normalization quickchecks are not performed

875

# for older versions

876

pass

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

877

else:

878

class Difference(Exception):pass

Collin Winter

a817e58

2007-08-22 23:05:06 +0000

[diff] [blame]

879

raise Difference(hex(i), k, old.table[i], new.table[i])

Martin v. Löwis

2007-08-14 22:37:03 +0000

[diff] [blame]

880

new.changed.append((version, list(zip(bidir_changes, category_changes,

Benjamin Peterson

6775231

2016-09-14 23:53:47 -0700

[diff] [blame]

881

decimal_changes, mirrored_changes,

882

east_asian_width_changes,

883

numeric_changes)),

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

884

normalization_changes))

Tim Peters

88ca467

2006-03-10 23:39:56 +0000

[diff] [blame]

885

Martin v. Löwis

2010-10-11 22:42:28 +0000

[diff] [blame]

886

def open_data(template, version):

887

local = template % ('-'+version,)

888

if not os.path.exists(local):

889

import urllib.request

890

if version == '3.2.0':

891

# irregular url structure

892

url = 'http://www.unicode.org/Public/3.2-Update/' + local

893

else:

894

url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')

895

urllib.request.urlretrieve(url, filename=local)

896

if local.endswith('.txt'):

897

return open(local, encoding='utf-8')

898

else:

899

# Unihan.zip

900

return open(local, 'rb')

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

901

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

902

# --------------------------------------------------------------------

903

# the following support code is taken from the unidb utilities

904

905

906

# load a unicode-data file from disk

907

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

908

class UnicodeData:

Martin v. Löwis

2007-08-14 22:37:03 +0000

[diff] [blame]

909

# Record structure:

910

# [ID, name, category, combining, bidi, decomp, (6)

911

# decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)

912

# ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)

913

# derived-props] (17)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

914

Martin v. Löwis

2010-10-11 22:42:28 +0000

[diff] [blame]

915

def __init__(self, version,

916

linebreakprops=False,

Martin v. Löwis

2010-11-22 09:00:02 +0000

[diff] [blame]

917

expand=1,

918

cjk_check=True):

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

919

self.changed = []

Martin v. Löwis

9def6a3

2002-10-18 16:11:54 +0000

[diff] [blame]

920

table = [None] * 0x110000

Ezio Melotti

2011-09-30 08:46:25 +0300

[diff] [blame]

921

with open_data(UNICODE_DATA, version) as file:

while 1:

s = file.readline()

if not s:

break

s = s.strip().split(";")

927

char = int(s[0], 16)

928

table[char] = s

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

929

Martin v. Löwis

2010-11-22 09:00:02 +0000

[diff] [blame]

930

cjk_ranges_found = []

931

Martin v. Löwis

2002-11-24 23:05:09 +0000

[diff] [blame]

932

# expand first-last ranges

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

933

if expand:

934

field = None

Martin v. Löwis

2002-11-24 23:05:09 +0000

[diff] [blame]

935

for i in range(0, 0x110000):

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

936

s = table[i]

937

if s:

938

if s[1][-6:] == "First>":

939

s[1] = ""

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

940

field = s

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

941

elif s[1][-5:] == "Last>":

Martin v. Löwis

2010-11-22 09:00:02 +0000

[diff] [blame]

942

if s[1].startswith("<CJK Ideograph"):

943

cjk_ranges_found.append((field[0],

944

s[0]))

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

945

s[1] = ""

946

field = None

947

elif field:

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

948

f2 = field[:]

949

f2[0] = "%X" % i

950

table[i] = f2

Martin v. Löwis

2010-11-22 09:00:02 +0000

[diff] [blame]

951

if cjk_check and cjk_ranges != cjk_ranges_found:

952

raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

953

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

954

# public attributes

Martin v. Löwis

2010-10-11 22:42:28 +0000

[diff] [blame]

955

self.filename = UNICODE_DATA % ''

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

956

self.table = table

Georg Brandl

bf82e37

2008-05-16 17:02:34 +0000

[diff] [blame]

957

self.chars = list(range(0x110000)) # unicode 3.2

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

958

Ezio Melotti

2011-10-21 21:57:36 +0300

[diff] [blame]

959

# check for name aliases and named sequences, see #12753

960

# aliases and named sequences are not in 3.2.0

961

if version != '3.2.0':

962

self.aliases = []

963

# store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,

964

# in order to take advantage of the compression and lookup

965

# algorithms used for the other characters

966

pua_index = NAME_ALIASES_START

967

with open_data(NAME_ALIASES, version) as file:

968

for s in file:

969

s = s.strip()

970

if not s or s.startswith('#'):

971

continue

Benjamin Peterson

71f660e

2012-02-20 22:24:29 -0500

[diff] [blame]

972

char, name, abbrev = s.split(';')

Ezio Melotti

2011-10-21 21:57:36 +0300

[diff] [blame]

973

char = int(char, 16)

974

self.aliases.append((name, char))

975

# also store the name in the PUA 1

976

self.table[pua_index][1] = name

977

pua_index += 1

978

assert pua_index - NAME_ALIASES_START == len(self.aliases)

979

980

self.named_sequences = []

Ezio Melotti

7c4a7e6

2013-08-26 01:32:56 +0300

[diff] [blame]

981

# store named sequences in the PUA 1, in range U+F0100..,

Ezio Melotti

2011-10-21 21:57:36 +0300

[diff] [blame]

982

# in order to take advantage of the compression and lookup

983

# algorithms used for the other characters.

984

Benjamin Peterson

71f660e

2012-02-20 22:24:29 -0500

[diff] [blame]

985

assert pua_index < NAMED_SEQUENCES_START

Ezio Melotti

2011-10-21 21:57:36 +0300

[diff] [blame]

986

pua_index = NAMED_SEQUENCES_START

987

with open_data(NAMED_SEQUENCES, version) as file:

988

for s in file:

989

s = s.strip()

990

if not s or s.startswith('#'):

991

continue

992

name, chars = s.split(';')

993

chars = tuple(int(char, 16) for char in chars.split())

994

# check that the structure defined in makeunicodename is OK

995

assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"

996

assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "

997

"the NamedSequence struct and in unicodedata_lookup")

998

self.named_sequences.append((name, chars))

999

# also store these in the PUA 1

1000

self.table[pua_index][1] = name

1001

pua_index += 1

1002

assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)

1003

Martin v. Löwis

2002-11-23 22:08:15 +0000

[diff] [blame]

1004

self.exclusions = {}

Ezio Melotti

2011-09-30 08:46:25 +0300

[diff] [blame]

1005

with open_data(COMPOSITION_EXCLUSIONS, version) as file:

for s in file:

s = s.strip()

if not s:

continue

if s[0] == '#':

continue

char = int(s.split()[0],16)

1013

self.exclusions[char] = 1

Martin v. Löwis

2002-11-23 22:08:15 +0000

[diff] [blame]

1014

Hye-Shik Chang

974ed7c

2004-06-02 16:49:17 +0000

[diff] [blame]

1015

widths = [None] * 0x110000

Ezio Melotti

2011-09-30 08:46:25 +0300

[diff] [blame]

1016

with open_data(EASTASIAN_WIDTH, version) as file:

for s in file:

s = s.strip()

if not s:

continue

if s[0] == '#':

continue

s = s.split()[0].split(';')

1024

if '..' in s[0]:

1025

first, last = [int(c, 16) for c in s[0].split('..')]

1026

chars = list(range(first, last+1))

1027

else:

1028

chars = [int(s[0], 16)]

for char in chars:

widths[char] = s[1]

Hye-Shik Chang

2004-06-02 16:49:17 +0000

[diff] [blame]

1032

for i in range(0, 0x110000):

1033

if table[i] is not None:

1034

table[i].append(widths[i])

1035

Martin v. Löwis

2007-08-14 22:37:03 +0000

[diff] [blame]

1036

for i in range(0, 0x110000):

1037

if table[i] is not None:

1038

table[i].append(set())

Martin v. Löwis

2007-08-14 22:37:03 +0000

[diff] [blame]

1039

Ezio Melotti

2011-09-30 08:46:25 +0300

[diff] [blame]

1040

with open_data(DERIVED_CORE_PROPERTIES, version) as file:

1041

for s in file:

1042

s = s.split('#', 1)[0].strip()

1043

if not s:

1044

continue

Martin v. Löwis

2007-08-14 22:37:03 +0000

[diff] [blame]

1045

Ezio Melotti

2011-09-30 08:46:25 +0300

[diff] [blame]

r, p = s.split(";")

r = r.strip()

p = p.strip()

if ".." in r:

first, last = [int(c, 16) for c in r.split('..')]

1051

chars = list(range(first, last+1))

else:

chars = [int(r, 16)]

for char in chars:

if table[char]:

# Some properties (e.g. Default_Ignorable_Code_Point)

1057

# apply to unassigned code points; ignore them

1058

table[char][-1].add(p)

1059

1060

with open_data(LINE_BREAK, version) as file:

1061

for s in file:

1062

s = s.partition('#')[0]

1063

s = [i.strip() for i in s.split(';')]

1064

if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:

1065

continue

1066

if '..' not in s[0]:

1067

first = last = int(s[0], 16)

1068

else:

1069

first, last = [int(c, 16) for c in s[0].split('..')]

1070

for char in range(first, last+1):

1071

table[char][-1].add('Line_Break')

Florent Xicluna

2010-03-30 19:34:18 +0000

[diff] [blame]

1072

Martin v. Löwis

2010-10-11 22:42:28 +0000

[diff] [blame]

1073

# We only want the quickcheck properties

1074

# Format: NF?_QC; Y(es)/N(o)/M(aybe)

1075

# Yes is the default, hence only N and M occur

1076

# In 3.2.0, the format was different (NF?_NO)

1077

# The parsing will incorrectly determine these as

1078

# "yes", however, unicodedata.c will not perform quickchecks

1079

# for older versions, and no delta records will be created.

1080

quickchecks = [0] * 0x110000

1081

qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()

Ezio Melotti

2011-09-30 08:46:25 +0300

[diff] [blame]

1082

with open_data(DERIVEDNORMALIZATION_PROPS, version) as file:

for s in file:

if '#' in s:

s = s[:s.index('#')]

s = [i.strip() for i in s.split(';')]

1087

if len(s) < 2 or s[1] not in qc_order:

1088

continue

1089

quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No

1090

quickcheck_shift = qc_order.index(s[1])*2

1091

quickcheck <<= quickcheck_shift

1092

if '..' not in s[0]:

1093

first = last = int(s[0], 16)

1094

else:

1095

first, last = [int(c, 16) for c in s[0].split('..')]

1096

for char in range(first, last+1):

1097

assert not (quickchecks[char]>>quickcheck_shift)&3

1098

quickchecks[char] |= quickcheck

Martin v. Löwis

2010-10-11 22:42:28 +0000

[diff] [blame]

1099

for i in range(0, 0x110000):

1100

if table[i] is not None:

1101

table[i].append(quickchecks[i])

Antoine Pitrou

2009-04-27 22:31:40 +0000

[diff] [blame]

1102

Ezio Melotti

2011-09-30 08:46:25 +0300

[diff] [blame]

1103

with open_data(UNIHAN, version) as file:

1104

zip = zipfile.ZipFile(file)

1105

if version == '3.2.0':

1106

data = zip.open('Unihan-3.2.0.txt').read()

1107

else:

1108

data = zip.open('Unihan_NumericValues.txt').read()

Martin v. Löwis

2010-10-11 22:42:28 +0000

[diff] [blame]

1109

for line in data.decode("utf-8").splitlines():

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

1110

if not line.startswith('U+'):

1111

continue

1112

code, tag, value = line.split(None, 3)[:3]

1113

if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',

1114

'kOtherNumeric'):

1115

continue

1116

value = value.strip().replace(',', '')

1117

i = int(code[2:], 16)

1118

# Patch the numeric field

1119

if table[i] is not None:

1120

table[i][8] = value

Benjamin Peterson

2012-01-11 18:17:06 -0500

[diff] [blame]

1121

sc = self.special_casing = {}

1122

with open_data(SPECIAL_CASING, version) as file:

1123

for s in file:

1124

s = s[:-1].split('#', 1)[0]

if not s:

continue

data = s.split("; ")

if data[4]:

# We ignore all conditionals (since they depend on

1130

# languages) except for one, which is hardcoded. See

1131

# handle_capital_sigma in unicodeobject.c.

1132

continue

1133

c = int(data[0], 16)

1134

lower = [int(char, 16) for char in data[1].split()]

1135

title = [int(char, 16) for char in data[2].split()]

1136

upper = [int(char, 16) for char in data[3].split()]

1137

sc[c] = (lower, title, upper)

Benjamin Peterson

2012-01-14 13:23:30 -0500

[diff] [blame]

1138

cf = self.case_folding = {}

1139

if version != '3.2.0':

1140

with open_data(CASE_FOLDING, version) as file:

1141

for s in file:

1142

s = s[:-1].split('#', 1)[0]

if not s:

continue

data = s.split("; ")

if data[1] in "CF":

c = int(data[0], 16)

cf[c] = [int(char, 16) for char in data[2].split()]

Amaury Forgeot d'Arc

2009-10-06 21:03:20 +0000

[diff] [blame]

1149

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

1150

def uselatin1(self):

1151

# restrict character range to ISO Latin 1

Georg Brandl

bf82e37

2008-05-16 17:02:34 +0000

[diff] [blame]

1152

self.chars = list(range(256))

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

1153

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

1154

# hash table tools

1155

1156

# this is a straight-forward reimplementation of Python's built-in

1157

# dictionary type, using a static data structure, and a custom string

1158

# hash algorithm.

1159

1160

def myhash(s, magic):

1161

h = 0

Walter Dörwald

2002-09-11 20:36:02 +0000

[diff] [blame]

1162

for c in map(ord, s.upper()):

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

1163

h = (h * magic) + c

Guido van Rossum

cd16bf6

2007-06-13 18:07:49 +0000

[diff] [blame]

1164

ix = h & 0xff000000

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

1165

if ix:

1166

h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff

return h

SIZES = [

(4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),

1171

(1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),

1172

(65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),

1173

(2097152,5), (4194304,3), (8388608,33), (16777216,27)

]

class Hash:

def __init__(self, name, data, magic):

1178

# turn a (key, value) list into a static hash table structure

1179

1180

# determine table size

1181

for size, poly in SIZES:

if size > len(data):

poly = size + poly

break

else:

Ezio Melotti

1392500

2011-03-16 11:05:33 +0200

[diff] [blame]

1186

raise AssertionError("ran out of polynomials")

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

1187

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

1188

print(size, "slots in hash table")

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

1189

1190

table = [None] * size

mask = size-1

n = 0

hash = myhash

# initialize hash table

1199

for key, value in data:

h = hash(key, magic)

i = (~h) & mask

v = table[i]

if v is None:

table[i] = value

continue

incr = (h ^ (h >> 3)) & mask;

if not incr:

incr = mask

while 1:

n = n + 1

i = (i + incr) & mask

v = table[i]

if v is None:

table[i] = value

break

incr = incr << 1

if incr > mask:

incr = incr ^ poly

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

1220

print(n, "collisions")

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

1221

self.collisions = n

1222

1223

for i in range(len(table)):

if table[i] is None:

table[i] = 0

self.data = Array(name + "_hash", table)

self.magic = magic

self.name = name

self.size = size

self.poly = poly

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

1233

def dump(self, file, trace):

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

1234

# write data to file, as a C array

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

1235

self.data.dump(file, trace)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

1236

file.write("#define %s_magic %d\n" % (self.name, self.magic))

1237

file.write("#define %s_size %d\n" % (self.name, self.size))

1238

file.write("#define %s_poly %d\n" % (self.name, self.poly))

1239

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

1240

# stuff to deal with arrays of unsigned integers

class Array:

def __init__(self, name, data):

self.name = name

self.data = data

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

1248

def dump(self, file, trace=0):

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

1249

# write data to file, as a C array

1250

size = getsize(self.data)

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

1251

if trace:

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

1252

print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

1253

file.write("static ")

1254

if size == 1:

1255

file.write("unsigned char")

1256

elif size == 2:

1257

file.write("unsigned short")

1258

else:

1259

file.write("unsigned int")

1260

file.write(" " + self.name + "[] = {\n")

1261

if self.data:

1262

s = " "

1263

for item in self.data:

1264

i = str(item) + ", "

1265

if len(s) + len(i) > 78:

Benjamin Peterson

2017-06-22 22:31:08 -0700

[diff] [blame]

1266

file.write(s.rstrip() + "\n")

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

1267

s = " " + i

1268

else:

1269

s = s + i

Walter Dörwald

2002-09-11 20:36:02 +0000

[diff] [blame]

1270

if s.strip():

Benjamin Peterson

2017-06-22 22:31:08 -0700

[diff] [blame]

1271

file.write(s.rstrip() + "\n")

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

file.write("};\n\n")

def getsize(data):

# return smallest possible integer size for the given array

maxdata = max(data)

if maxdata < 256:

return 1

elif maxdata < 65536:

return 2

else:

return 4

Tim Peters

2000-09-25 07:13:41 +0000

[diff] [blame]

1284

def splitbins(t, trace=0):

1285

"""t, trace=0 -> (t1, t2, shift). Split a table to save space.

1286

1287

t is a sequence of ints. This function can be useful to save space if

1288

many of the ints are the same. t1 and t2 are lists of ints, and shift

1289

is an int, chosen to minimize the combined size of t1 and t2 (in C

1290

code), and where for each i in range(len(t)),

1291

t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]

1292

where mask is a bitmask isolating the last "shift" bits.

1293

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

1294

If optional arg trace is non-zero (default zero), progress info

1295

is printed to sys.stderr. The higher the value, the more info

1296

you'll get.

Tim Peters

2000-09-25 07:13:41 +0000

[diff] [blame]

1297

"""

1298

Tim Peters

2000-09-25 07:13:41 +0000

[diff] [blame]

1299

if trace:

1300

def dump(t1, t2, shift, bytes):

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

1301

print("%d+%d bins at shift %d; %d bytes" % (

1302

len(t1), len(t2), shift, bytes), file=sys.stderr)

1303

print("Size of original table:", len(t)*getsize(t), \

1304

"bytes", file=sys.stderr)

Tim Peters

2000-09-25 07:13:41 +0000

[diff] [blame]

1305

n = len(t)-1 # last valid index

1306

maxshift = 0 # the most we can shift n and still have something left

if n > 0:

while n >> 1:

n >>= 1

maxshift += 1

del n

Christian Heimes

a37d4c6

2007-12-04 23:02:19 +0000

[diff] [blame]

1312

bytes = sys.maxsize # smallest total size so far

Tim Peters

2000-09-25 07:13:41 +0000

[diff] [blame]

1313

t = tuple(t) # so slices can be dict keys

1314

for shift in range(maxshift + 1):

1315

t1 = []

1316

t2 = []

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

1317

size = 2**shift

1318

bincache = {}

Tim Peters

2000-09-25 07:13:41 +0000

[diff] [blame]

1319

for i in range(0, len(t), size):

1320

bin = t[i:i+size]

1321

index = bincache.get(bin)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

1322

if index is None:

Tim Peters

2000-09-25 07:13:41 +0000

[diff] [blame]

1323

index = len(t2)

1324

bincache[bin] = index

1325

t2.extend(bin)

1326

t1.append(index >> shift)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

1327

# determine memory size

Tim Peters

2000-09-25 07:13:41 +0000

[diff] [blame]

1328

b = len(t1)*getsize(t1) + len(t2)*getsize(t2)

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

1329

if trace > 1:

Tim Peters

2000-09-25 07:13:41 +0000

[diff] [blame]

1330

dump(t1, t2, shift, b)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

1331

if b < bytes:

Tim Peters

2000-09-25 07:13:41 +0000

[diff] [blame]

1332

best = t1, t2, shift

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

1333

bytes = b

Tim Peters

2000-09-25 07:13:41 +0000

[diff] [blame]

1334

t1, t2, shift = best

1335

if trace:

Collin Winter

2007-08-03 17:06:41 +0000

[diff] [blame]

1336

print("Best:", end=' ', file=sys.stderr)

Tim Peters

2000-09-25 07:13:41 +0000

[diff] [blame]

1337

dump(t1, t2, shift, bytes)

1338

if __debug__:

1339

# exhaustively verify that the decomposition is correct

1340

mask = ~((~0) << shift) # i.e., low-bit mask of shift bits

Guido van Rossum

805365e

2007-05-07 22:24:25 +0000

[diff] [blame]

1341

for i in range(len(t)):

Tim Peters

2000-09-25 07:13:41 +0000

[diff] [blame]

1342

assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]

1343

return best

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

1344

1345

if __name__ == "__main__":

Fredrik Lundh