# (git web-view residue) blob: 5d8014a5da3afe1432a7bfec899267d0bdf4e4ee
#
# (re)generate unicode property and type databases
#
# This script converts Unicode database files to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
# 2008-06-11 gb   add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
# 2011-10-21 ezio add support for name aliases and named sequences
# 2012-01    benjamin add full case mappings
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#

import os
import sys
import zipfile

from textwrap import dedent

SCRIPT = sys.argv[0]
VERSION = "3.2"

# The Unicode Database
# --------------------
# When changing UCD version please update
# * Doc/library/stdtypes.rst, and
# * Doc/library/unicodedata.rst
# * Doc/reference/lexical_analysis.rst (two occurrences)
UNIDATA_VERSION = "9.0.0"

# Input file name templates; "%s" receives "" for the current version or
# "-<version>" for an old UCD snapshot (see UnicodeData / maketables).
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt"
CASE_FOLDING = "CaseFolding%s.txt"

# Private Use Areas -- in planes 1, 15, 16
PUA_1 = range(0xE000, 0xF900)
PUA_15 = range(0xF0000, 0xFFFFE)
PUA_16 = range(0x100000, 0x10FFFE)

# we use these ranges of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0200

# Old UCD snapshots for which delta tables are generated (ucd_3_2_0 support).
old_versions = ["3.2.0"]

# General-category short names; index into this list is stored in the record.
# NOTE: "Cn" appears twice (indexes 0 and 17) in the historical table.
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

# Bidirectional-class short names; index 0 is the empty (unassigned) class.
BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON", "LRI", "RLI", "FSI", "PDI" ]

# East Asian Width property values (UAX #11).
EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

# Line-break classes that force a mandatory break (UAX #14).
MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# Character-type flag bits.
# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NUMERIC_MASK = 0x800
CASE_IGNORABLE_MASK = 0x1000
CASED_MASK = 0x2000
EXTENDED_CASE_MASK = 0x4000

# CJK Unified Ideograph first/last code points (hex strings);
# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
    ('3400', '4DB5'),
    ('4E00', '9FD5'),
    ('20000', '2A6D6'),
    ('2A700', '2B734'),
    ('2B740', '2B81D'),
    ('2B820', '2CEA1'),
]
def maketables(trace=0):
    """Read the Unicode database files and regenerate all three C headers.

    Reads the current UNIDATA_VERSION database plus each snapshot listed in
    ``old_versions`` (for version-delta tables), then writes the name, data
    and type tables via the three maker functions.

    trace -- passed through to the table generators; non-zero enables
             splitbins() diagnostics.
    """
    print("--- Reading", UNICODE_DATA % "", "...")

    unicode = UnicodeData(UNIDATA_VERSION)

    print(len(list(filter(None, unicode.table))), "characters")

    # Merge each old snapshot so that delta tables can be generated.
    # cjk_check=False: old snapshots predate the current CJK ranges.
    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-"+version), "...")
        old_unicode = UnicodeData(version, cjk_check=False)
        print(len(list(filter(None, old_unicode.table))), "characters")
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)
128# --------------------------------------------------------------------
129# unicode character properties
130
131def makeunicodedata(unicode, trace):
132
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000133 dummy = (0, 0, 0, 0, 0, 0)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000134 table = [dummy]
135 cache = {0: dummy}
136 index = [0] * len(unicode.chars)
137
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000138 FILE = "Modules/unicodedata_db.h"
139
Collin Winter6afaeb72007-08-03 17:06:41 +0000140 print("--- Preparing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000141
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000142 # 1) database properties
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000143
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000144 for char in unicode.chars:
145 record = unicode.table[char]
146 if record:
147 # extract database properties
148 category = CATEGORY_NAMES.index(record[2])
149 combining = int(record[3])
150 bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
151 mirrored = record[9] == "Y"
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000152 eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000153 normalizationquickcheck = record[17]
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000154 item = (
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000155 category, combining, bidirectional, mirrored, eastasianwidth,
156 normalizationquickcheck
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000157 )
158 # add entry to index and item tables
159 i = cache.get(item)
160 if i is None:
161 cache[item] = i = len(table)
162 table.append(item)
163 index[char] = i
164
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000165 # 2) decomposition data
166
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000167 decomp_data = [0]
168 decomp_prefix = [""]
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000169 decomp_index = [0] * len(unicode.chars)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000170 decomp_size = 0
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000171
Martin v. Löwis677bde22002-11-23 22:08:15 +0000172 comp_pairs = []
173 comp_first = [None] * len(unicode.chars)
174 comp_last = [None] * len(unicode.chars)
175
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000176 for char in unicode.chars:
177 record = unicode.table[char]
178 if record:
179 if record[5]:
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000180 decomp = record[5].split()
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000181 if len(decomp) > 19:
Collin Wintera817e582007-08-22 23:05:06 +0000182 raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000183 # prefix
184 if decomp[0][0] == "<":
185 prefix = decomp.pop(0)
186 else:
187 prefix = ""
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000188 try:
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000189 i = decomp_prefix.index(prefix)
190 except ValueError:
191 i = len(decomp_prefix)
192 decomp_prefix.append(prefix)
193 prefix = i
194 assert prefix < 256
195 # content
Georg Brandlbf82e372008-05-16 17:02:34 +0000196 decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
Martin v. Löwis677bde22002-11-23 22:08:15 +0000197 # Collect NFC pairs
198 if not prefix and len(decomp) == 3 and \
199 char not in unicode.exclusions and \
200 unicode.table[decomp[1]][3] == "0":
201 p, l, r = decomp
202 comp_first[l] = 1
203 comp_last[r] = 1
204 comp_pairs.append((l,r,char))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000205 try:
206 i = decomp_data.index(decomp)
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000207 except ValueError:
208 i = len(decomp_data)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000209 decomp_data.extend(decomp)
210 decomp_size = decomp_size + len(decomp) * 2
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000211 else:
212 i = 0
213 decomp_index[char] = i
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000214
Martin v. Löwis677bde22002-11-23 22:08:15 +0000215 f = l = 0
216 comp_first_ranges = []
217 comp_last_ranges = []
218 prev_f = prev_l = None
219 for i in unicode.chars:
220 if comp_first[i] is not None:
221 comp_first[i] = f
222 f += 1
223 if prev_f is None:
224 prev_f = (i,i)
225 elif prev_f[1]+1 == i:
226 prev_f = prev_f[0],i
227 else:
228 comp_first_ranges.append(prev_f)
229 prev_f = (i,i)
230 if comp_last[i] is not None:
231 comp_last[i] = l
232 l += 1
233 if prev_l is None:
234 prev_l = (i,i)
235 elif prev_l[1]+1 == i:
236 prev_l = prev_l[0],i
237 else:
238 comp_last_ranges.append(prev_l)
239 prev_l = (i,i)
240 comp_first_ranges.append(prev_f)
241 comp_last_ranges.append(prev_l)
242 total_first = f
243 total_last = l
244
245 comp_data = [0]*(total_first*total_last)
246 for f,l,char in comp_pairs:
247 f = comp_first[f]
248 l = comp_last[l]
249 comp_data[f*total_last+l] = char
250
Collin Winter6afaeb72007-08-03 17:06:41 +0000251 print(len(table), "unique properties")
252 print(len(decomp_prefix), "unique decomposition prefixes")
253 print(len(decomp_data), "unique decomposition entries:", end=' ')
254 print(decomp_size, "bytes")
255 print(total_first, "first characters in NFC")
256 print(total_last, "last characters in NFC")
257 print(len(comp_pairs), "NFC pairs")
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000258
Collin Winter6afaeb72007-08-03 17:06:41 +0000259 print("--- Writing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000260
Fred Drake9c685052000-10-26 03:56:46 +0000261 fp = open(FILE, "w")
Collin Winter6afaeb72007-08-03 17:06:41 +0000262 print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
263 print(file=fp)
264 print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
265 print("/* a list of unique database records */", file=fp)
266 print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000267 for item in table:
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000268 print(" {%d, %d, %d, %d, %d, %d}," % item, file=fp)
Collin Winter6afaeb72007-08-03 17:06:41 +0000269 print("};", file=fp)
270 print(file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000271
Collin Winter6afaeb72007-08-03 17:06:41 +0000272 print("/* Reindexing of NFC first characters. */", file=fp)
273 print("#define TOTAL_FIRST",total_first, file=fp)
274 print("#define TOTAL_LAST",total_last, file=fp)
275 print("struct reindex{int start;short count,index;};", file=fp)
Martin v. Löwis59683e82008-06-13 07:50:45 +0000276 print("static struct reindex nfc_first[] = {", file=fp)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000277 for start,end in comp_first_ranges:
Collin Winter6afaeb72007-08-03 17:06:41 +0000278 print(" { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)
279 print(" {0,0,0}", file=fp)
280 print("};\n", file=fp)
Martin v. Löwis59683e82008-06-13 07:50:45 +0000281 print("static struct reindex nfc_last[] = {", file=fp)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000282 for start,end in comp_last_ranges:
Collin Winter6afaeb72007-08-03 17:06:41 +0000283 print(" { %d, %d, %d}," % (start,end-start,comp_last[start]), file=fp)
284 print(" {0,0,0}", file=fp)
285 print("};\n", file=fp)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000286
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000287 # FIXME: <fl> the following tables could be made static, and
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000288 # the support code moved into unicodedatabase.c
289
Collin Winter6afaeb72007-08-03 17:06:41 +0000290 print("/* string literals */", file=fp)
291 print("const char *_PyUnicode_CategoryNames[] = {", file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000292 for name in CATEGORY_NAMES:
Collin Winter6afaeb72007-08-03 17:06:41 +0000293 print(" \"%s\"," % name, file=fp)
294 print(" NULL", file=fp)
295 print("};", file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000296
Collin Winter6afaeb72007-08-03 17:06:41 +0000297 print("const char *_PyUnicode_BidirectionalNames[] = {", file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000298 for name in BIDIRECTIONAL_NAMES:
Collin Winter6afaeb72007-08-03 17:06:41 +0000299 print(" \"%s\"," % name, file=fp)
300 print(" NULL", file=fp)
301 print("};", file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000302
Collin Winter6afaeb72007-08-03 17:06:41 +0000303 print("const char *_PyUnicode_EastAsianWidthNames[] = {", file=fp)
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000304 for name in EASTASIANWIDTH_NAMES:
Collin Winter6afaeb72007-08-03 17:06:41 +0000305 print(" \"%s\"," % name, file=fp)
306 print(" NULL", file=fp)
307 print("};", file=fp)
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000308
Collin Winter6afaeb72007-08-03 17:06:41 +0000309 print("static const char *decomp_prefix[] = {", file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000310 for name in decomp_prefix:
Collin Winter6afaeb72007-08-03 17:06:41 +0000311 print(" \"%s\"," % name, file=fp)
312 print(" NULL", file=fp)
313 print("};", file=fp)
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000314
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000315 # split record index table
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000316 index1, index2, shift = splitbins(index, trace)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000317
Collin Winter6afaeb72007-08-03 17:06:41 +0000318 print("/* index tables for the database records */", file=fp)
319 print("#define SHIFT", shift, file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000320 Array("index1", index1).dump(fp, trace)
321 Array("index2", index2).dump(fp, trace)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000322
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000323 # split decomposition index table
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000324 index1, index2, shift = splitbins(decomp_index, trace)
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000325
Collin Winter6afaeb72007-08-03 17:06:41 +0000326 print("/* decomposition data */", file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000327 Array("decomp_data", decomp_data).dump(fp, trace)
328
Collin Winter6afaeb72007-08-03 17:06:41 +0000329 print("/* index tables for the decomposition data */", file=fp)
330 print("#define DECOMP_SHIFT", shift, file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000331 Array("decomp_index1", index1).dump(fp, trace)
332 Array("decomp_index2", index2).dump(fp, trace)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000333
Martin v. Löwis677bde22002-11-23 22:08:15 +0000334 index, index2, shift = splitbins(comp_data, trace)
Collin Winter6afaeb72007-08-03 17:06:41 +0000335 print("/* NFC pairs */", file=fp)
336 print("#define COMP_SHIFT", shift, file=fp)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000337 Array("comp_index", index).dump(fp, trace)
338 Array("comp_data", index2).dump(fp, trace)
339
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000340 # Generate delta tables for old versions
341 for version, table, normalization in unicode.changed:
342 cversion = version.replace(".","_")
343 records = [table[0]]
344 cache = {table[0]:0}
345 index = [0] * len(table)
346 for i, record in enumerate(table):
347 try:
348 index[i] = cache[record]
349 except KeyError:
350 index[i] = cache[record] = len(records)
351 records.append(record)
352 index1, index2, shift = splitbins(index, trace)
Collin Winter6afaeb72007-08-03 17:06:41 +0000353 print("static const change_record change_records_%s[] = {" % cversion, file=fp)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000354 for record in records:
Collin Winter6afaeb72007-08-03 17:06:41 +0000355 print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
356 print("};", file=fp)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000357 Array("changes_%s_index" % cversion, index1).dump(fp, trace)
358 Array("changes_%s_data" % cversion, index2).dump(fp, trace)
Collin Winter6afaeb72007-08-03 17:06:41 +0000359 print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
360 print("{", file=fp)
361 print("\tint index;", file=fp)
362 print("\tif (n >= 0x110000) index = 0;", file=fp)
363 print("\telse {", file=fp)
364 print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
365 print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
366 (cversion, shift, ((1<<shift)-1)), file=fp)
367 print("\t}", file=fp)
368 print("\treturn change_records_%s+index;" % cversion, file=fp)
369 print("}\n", file=fp)
370 print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
371 print("{", file=fp)
372 print("\tswitch(n) {", file=fp)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000373 for k, v in normalization:
Collin Winter6afaeb72007-08-03 17:06:41 +0000374 print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
375 print("\tdefault: return 0;", file=fp)
376 print("\t}\n}\n", file=fp)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000377
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000378 fp.close()
379
# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):
    """Write Objects/unicodetype_db.h: per-character type records
    (case mappings, digit values, flags), the extended-case array, and
    the generated _PyUnicode_ToNumeric / _PyUnicode_IsWhitespace /
    _PyUnicode_IsLinebreak C functions.

    unicode -- a UnicodeData instance.
    trace   -- forwarded to splitbins()/Array.dump() for diagnostics.
    """
    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types; record 0 is the dummy for unassigned chars
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}          # numeric value (string) -> [code points]
    spaces = []           # whitespace code points
    linebreaks = []       # line-break code points
    extra_casing = []     # flat array for multi-character case mappings

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if "Lowercase" in properties:
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if "Uppercase" in properties:
                flags |= UPPER_MASK
            # printable: everything except other control/format/surrogate
            # and separators -- but U+0020 SPACE itself is printable
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            if "Cased" in properties:
                flags |= CASED_MASK
            if "Case_Ignorable" in properties:
                flags |= CASE_IGNORABLE_MASK
            sc = unicode.special_casing.get(char)
            cf = unicode.case_folding.get(char, [char])
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                title = upper
            # A case folding that differs from the simple lowercase mapping
            # forces the extended-case representation.
            if sc is None and cf != [lower]:
                sc = ([lower], [title], [upper])
            if sc is None:
                if upper == lower == title:
                    upper = lower = title = 0
                else:
                    # store deltas relative to the character itself
                    upper = upper - char
                    lower = lower - char
                    title = title - char
                    assert (abs(upper) <= 2147483647 and
                            abs(lower) <= 2147483647 and
                            abs(title) <= 2147483647)
            else:
                # This happens either when some character maps to more than one
                # character in uppercase, lowercase, or titlecase or the
                # casefolded version of the character is different from the
                # lowercase. The extra characters are stored in a different
                # array.
                flags |= EXTENDED_CASE_MASK
                lower = len(extra_casing) | (len(sc[0]) << 24)
                extra_casing.extend(sc[0])
                if cf != sc[0]:
                    lower |= len(cf) << 20
                    extra_casing.extend(cf)
                upper = len(extra_casing) | (len(sc[2]) << 24)
                extra_casing.extend(sc[2])
                # Title is probably equal to upper.
                if sc[1] == sc[2]:
                    title = upper
                else:
                    title = len(extra_casing) | (len(sc[1]) << 24)
                    extra_casing.extend(sc[1])
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")
    print(len(extra_casing), "extended case array")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("/* a list of unique character type descriptors */", file=fp)
    print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    print("/* extended case mappings */", file=fp)
    print(file=fp)
    print("const Py_UCS4 _PyUnicode_ExtendedCase[] = {", file=fp)
    for c in extra_casing:
        print("    %d," % c, file=fp)
    print("};", file=fp)
    print(file=fp)

    # split the type index table
    index1, index2, shift = splitbins(index, trace)

    print("/* type indexes */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # Generate code for _PyUnicode_ToNumeric()
    numeric_items = sorted(numeric.items())
    print('/* Returns the numeric value as double for Unicode characters', file=fp)
    print(' * having this property, -1.0 otherwise.', file=fp)
    print(' */', file=fp)
    print('double _PyUnicode_ToNumeric(Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for value, codepoints in numeric_items:
        # Turn text into float literals
        parts = value.split('/')
        parts = [repr(float(part)) for part in parts]
        value = '/'.join(parts)

        codepoints.sort()
        for codepoint in codepoints:
            print('    case 0x%04X:' % (codepoint,), file=fp)
        print('        return (double) %s;' % (value,), file=fp)
    print('    }', file=fp)
    print('    return -1.0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsWhitespace()
    print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)
    print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsWhitespace(const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)

    for codepoint in sorted(spaces):
        print('    case 0x%04X:' % (codepoint,), file=fp)
    print('        return 1;', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsLinebreak()
    print("/* Returns 1 for Unicode characters having the line break", file=fp)
    print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp)
    print(" * type 'B', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for codepoint in sorted(linebreaks):
        print('    case 0x%04X:' % (codepoint,), file=fp)
    print('        return 1;', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    fp.close()
593# --------------------------------------------------------------------
594# unicode name database
595
def makeunicodename(unicode, trace):
    """Generate Modules/unicodename_db.h, the code<->name database.

    Three structures are emitted:
      - a "lexicon": every word occurring in character names, concatenated,
        with bit 7 set on the last character of each word;
      - a "phrasebook": for each code point, the sequence of lexicon word
        indexes making up its name (frequent words take one byte, the rest
        two), accessed through a two-level (split) offset table;
      - a static hash table mapping names back to code points.

    unicode is a UnicodeData instance; trace enables size statistics.
    """

    FILE = "Modules/unicodename_db.h"

    print("--- Preparing", FILE, "...")

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                # NUL-terminate; the terminator travels with the last word
                names[char] = name + chr(0)

    print(len(list(n for n in names if n is not None)), "distinct names")

    # collect unique words from names (note that we distinguish between
    # words inside a name and words ending a name; the latter include
    # the trailing null byte)

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    # one entry per occurrence, so len(l) is the frequency
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print(n, "words in text;", b, "bytes")

    wordlist = list(words.items())

    # sort on falling frequency, then by name
    def word_key(a):
        aword, alist = a
        return -len(alist), aword
    wordlist.sort(key=word_key)

    # figure out how many phrasebook escapes we need: words beyond the
    # "short" range are encoded as (escape byte, low byte) pairs
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print(escapes, "escapes")

    short = 256 - escapes

    assert short > 0

    print(short, "short indexes in lexicon")

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print(n, "short indexes in phrasebook")

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = list(map(ord, lexicon))

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    # the phrasebook must fit in unsigned chars for the C tables to work
    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set. if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("#define NAME_MAXLEN", 256, file=fp)
    print(file=fp)
    print("/* lexicon */", file=fp)
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split the phrasebook offset table into a two-level index to save space
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print("/* code->name phrasebook */", file=fp)
    print("#define phrasebook_shift", shift, file=fp)
    print("#define phrasebook_short", short, file=fp)

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print("/* name->code dictionary */", file=fp)
    codehash.dump(fp, trace)

    print(file=fp)
    print('static const unsigned int aliases_start = %#x;' %
          NAME_ALIASES_START, file=fp)
    print('static const unsigned int aliases_end = %#x;' %
          (NAME_ALIASES_START + len(unicode.aliases)), file=fp)

    print('static const unsigned int name_aliases[] = {', file=fp)
    for name, codepoint in unicode.aliases:
        print('    0x%04X,' % codepoint, file=fp)
    print('};', file=fp)

    # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
    # so we are using Py_UCS2 seq[4].  This needs to be updated if longer
    # sequences or sequences with non-BMP chars are added.
    # unicodedata_lookup should be adapted too.
    print(dedent("""
        typedef struct NamedSequence {
            int seqlen;
            Py_UCS2 seq[4];
        } named_sequence;
        """), file=fp)

    print('static const unsigned int named_sequences_start = %#x;' %
          NAMED_SEQUENCES_START, file=fp)
    print('static const unsigned int named_sequences_end = %#x;' %
          (NAMED_SEQUENCES_START + len(unicode.named_sequences)), file=fp)

    print('static const named_sequence named_sequences[] = {', file=fp)
    for name, sequence in unicode.named_sequences:
        seq_str = ', '.join('0x%04X' % cp for cp in sequence)
        print('    {%d, {%s}},' % (len(sequence), seq_str), file=fp)
    print('};', file=fp)

    fp.close()
787
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000788
789def merge_old_version(version, new, old):
790 # Changes to exclusion file not implemented yet
791 if old.exclusions != new.exclusions:
Collin Wintera817e582007-08-22 23:05:06 +0000792 raise NotImplementedError("exclusions differ")
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000793
794 # In these change records, 0xFF means "no change"
795 bidir_changes = [0xFF]*0x110000
796 category_changes = [0xFF]*0x110000
797 decimal_changes = [0xFF]*0x110000
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000798 mirrored_changes = [0xFF]*0x110000
Benjamin Peterson67752312016-09-14 23:53:47 -0700799 east_asian_width_changes = [0xFF]*0x110000
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000800 # In numeric data, 0 means "no change",
801 # -1 means "did not have a numeric value
802 numeric_changes = [0] * 0x110000
803 # normalization_changes is a list of key-value pairs
804 normalization_changes = []
805 for i in range(0x110000):
806 if new.table[i] is None:
807 # Characters unassigned in the new version ought to
808 # be unassigned in the old one
809 assert old.table[i] is None
810 continue
811 # check characters unassigned in the old version
812 if old.table[i] is None:
813 # category 0 is "unassigned"
814 category_changes[i] = 0
815 continue
816 # check characters that differ
817 if old.table[i] != new.table[i]:
818 for k in range(len(old.table[i])):
819 if old.table[i][k] != new.table[i][k]:
820 value = old.table[i][k]
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300821 if k == 1 and i in PUA_15:
822 # the name is not set in the old.table, but in the
823 # new.table we are using it for aliases and named seq
824 assert value == ''
825 elif k == 2:
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000826 #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
827 category_changes[i] = CATEGORY_NAMES.index(value)
828 elif k == 4:
829 #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
830 bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
831 elif k == 5:
832 #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
833 # We assume that all normalization changes are in 1:1 mappings
834 assert " " not in value
835 normalization_changes.append((i, value))
836 elif k == 6:
837 #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
838 # we only support changes where the old value is a single digit
839 assert value in "0123456789"
840 decimal_changes[i] = int(value)
841 elif k == 8:
842 # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
843 # Since 0 encodes "no change", the old value is better not 0
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000844 if not value:
845 numeric_changes[i] = -1
846 else:
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +0000847 numeric_changes[i] = float(value)
848 assert numeric_changes[i] not in (0, -1)
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000849 elif k == 9:
850 if value == 'Y':
851 mirrored_changes[i] = '1'
852 else:
853 mirrored_changes[i] = '0'
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000854 elif k == 11:
855 # change to ISO comment, ignore
856 pass
857 elif k == 12:
858 # change to simple uppercase mapping; ignore
859 pass
860 elif k == 13:
861 # change to simple lowercase mapping; ignore
862 pass
863 elif k == 14:
864 # change to simple titlecase mapping; ignore
865 pass
Benjamin Peterson67752312016-09-14 23:53:47 -0700866 elif k == 15:
867 # change to east asian width
868 east_asian_width_changes[i] = EASTASIANWIDTH_NAMES.index(value)
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000869 elif k == 16:
870 # derived property changes; not yet
871 pass
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000872 elif k == 17:
873 # normalization quickchecks are not performed
874 # for older versions
875 pass
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000876 else:
877 class Difference(Exception):pass
Collin Wintera817e582007-08-22 23:05:06 +0000878 raise Difference(hex(i), k, old.table[i], new.table[i])
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000879 new.changed.append((version, list(zip(bidir_changes, category_changes,
Benjamin Peterson67752312016-09-14 23:53:47 -0700880 decimal_changes, mirrored_changes,
881 east_asian_width_changes,
882 numeric_changes)),
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000883 normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000884
def open_data(template, version):
    """Return an open file object for the UCD data file named by *template*.

    The file is downloaded from unicode.org on first use and cached in the
    current directory.  ``*.txt`` files are opened in UTF-8 text mode;
    anything else (i.e. Unihan.zip) is opened in binary mode.
    """
    local = template % ('-' + version,)
    if not os.path.exists(local):
        import urllib.request
        if version == '3.2.0':
            # the 3.2.0 files live under an irregularly named directory
            url = 'http://www.unicode.org/Public/3.2-Update/' + local
        else:
            url = ('http://www.unicode.org/Public/%s/ucd/' + template) % (version, '')
        urllib.request.urlretrieve(url, filename=local)
    if local.endswith('.txt'):
        return open(local, encoding='utf-8')
    # binary payload (Unihan.zip)
    return open(local, 'rb')
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000900
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000901# --------------------------------------------------------------------
902# the following support code is taken from the unidb utilities
903# Copyright (c) 1999-2000 by Secret Labs AB
904
905# load a unicode-data file from disk
906
class UnicodeData:
    """In-memory representation of one version of the Unicode database.

    Parses UnicodeData.txt plus several auxiliary UCD files and exposes:
      table      -- per code point, the list of fields described below
      chars      -- the list of code points covered (all of 0..0x10FFFF)
      aliases, named_sequences, exclusions, special_casing, case_folding
    """
    # Record structure (per-code-point list in self.table):
    # [ID, name, category, combining, bidi, decomp,  (6)
    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
    #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
    #  derived-props] (17)

    def __init__(self, version,
                 linebreakprops=False,
                 expand=1,
                 cjk_check=True):
        # NOTE(review): linebreakprops is accepted but never referenced in
        # this constructor -- presumably kept for caller compatibility.
        self.changed = []
        table = [None] * 0x110000
        # stage 1: parse UnicodeData.txt into the raw field table
        with open_data(UNICODE_DATA, version) as file:
            while 1:
                s = file.readline()
                if not s:
                    break
                s = s.strip().split(";")
                char = int(s[0], 16)
                table[char] = s

        cjk_ranges_found = []

        # expand first-last ranges
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        if s[1].startswith("<CJK Ideograph"):
                            cjk_ranges_found.append((field[0],
                                                     s[0]))
                        s[1] = ""
                        field = None
                elif field:
                    # inside a First>..Last> range: copy the range record
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2
        if cjk_check and cjk_ranges != cjk_ranges_found:
            raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)

        # public attributes
        self.filename = UNICODE_DATA % ''
        self.table = table
        self.chars = list(range(0x110000)) # unicode 3.2

        # check for name aliases and named sequences, see #12753
        # aliases and named sequences are not in 3.2.0
        if version != '3.2.0':
            self.aliases = []
            # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters
            pua_index = NAME_ALIASES_START
            with open_data(NAME_ALIASES, version) as file:
                for s in file:
                    s = s.strip()
                    if not s or s.startswith('#'):
                        continue
                    char, name, abbrev = s.split(';')
                    char = int(char, 16)
                    self.aliases.append((name, char))
                    # also store the name in the PUA 1
                    self.table[pua_index][1] = name
                    pua_index += 1
            assert pua_index - NAME_ALIASES_START == len(self.aliases)

            self.named_sequences = []
            # store named sequences in the PUA 1, in range U+F0100..,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters.

            assert pua_index < NAMED_SEQUENCES_START
            pua_index = NAMED_SEQUENCES_START
            with open_data(NAMED_SEQUENCES, version) as file:
                for s in file:
                    s = s.strip()
                    if not s or s.startswith('#'):
                        continue
                    name, chars = s.split(';')
                    chars = tuple(int(char, 16) for char in chars.split())
                    # check that the structure defined in makeunicodename is OK
                    assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
                    assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
                        "the NamedSequence struct and in unicodedata_lookup")
                    self.named_sequences.append((name, chars))
                    # also store these in the PUA 1
                    self.table[pua_index][1] = name
                    pua_index += 1
            assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)

        # stage 2: composition exclusions (set of excluded code points)
        self.exclusions = {}
        with open_data(COMPOSITION_EXCLUSIONS, version) as file:
            for s in file:
                s = s.strip()
                if not s:
                    continue
                if s[0] == '#':
                    continue
                char = int(s.split()[0],16)
                self.exclusions[char] = 1

        # stage 3: east asian width, appended as field 15
        widths = [None] * 0x110000
        with open_data(EASTASIAN_WIDTH, version) as file:
            for s in file:
                s = s.strip()
                if not s:
                    continue
                if s[0] == '#':
                    continue
                s = s.split()[0].split(';')
                if '..' in s[0]:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                    chars = list(range(first, last+1))
                else:
                    chars = [int(s[0], 16)]
                for char in chars:
                    widths[char] = s[1]

        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

        # stage 4: derived properties, appended as a set in field 16
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(set())

        with open_data(DERIVED_CORE_PROPERTIES, version) as file:
            for s in file:
                s = s.split('#', 1)[0].strip()
                if not s:
                    continue

                r, p = s.split(";")
                r = r.strip()
                p = p.strip()
                if ".." in r:
                    first, last = [int(c, 16) for c in r.split('..')]
                    chars = list(range(first, last+1))
                else:
                    chars = [int(r, 16)]
                for char in chars:
                    if table[char]:
                        # Some properties (e.g. Default_Ignorable_Code_Point)
                        # apply to unassigned code points; ignore them
                        table[char][-1].add(p)

        # mandatory line breaks are folded into the derived-property set
        with open_data(LINE_BREAK, version) as file:
            for s in file:
                s = s.partition('#')[0]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
                    continue
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    table[char][-1].add('Line_Break')

        # We only want the quickcheck properties
        # Format: NF?_QC; Y(es)/N(o)/M(aybe)
        # Yes is the default, hence only N and M occur
        # In 3.2.0, the format was different (NF?_NO)
        # The parsing will incorrectly determine these as
        # "yes", however, unicodedata.c will not perform quickchecks
        # for older versions, and no delta records will be created.
        quickchecks = [0] * 0x110000
        qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
        with open_data(DERIVEDNORMALIZATION_PROPS, version) as file:
            for s in file:
                if '#' in s:
                    s = s[:s.index('#')]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in qc_order:
                    continue
                quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
                quickcheck_shift = qc_order.index(s[1])*2
                quickcheck <<= quickcheck_shift
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    assert not (quickchecks[char]>>quickcheck_shift)&3
                    quickchecks[char] |= quickcheck
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(quickchecks[i])

        # stage 5: Unihan numeric values patch the numeric field (8)
        with open_data(UNIHAN, version) as file:
            zip = zipfile.ZipFile(file)
            if version == '3.2.0':
                data = zip.open('Unihan-3.2.0.txt').read()
            else:
                data = zip.open('Unihan_NumericValues.txt').read()
        for line in data.decode("utf-8").splitlines():
            if not line.startswith('U+'):
                continue
            code, tag, value = line.split(None, 3)[:3]
            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                           'kOtherNumeric'):
                continue
            value = value.strip().replace(',', '')
            i = int(code[2:], 16)
            # Patch the numeric field
            if table[i] is not None:
                table[i][8] = value
        # stage 6: full (one-to-many) case mappings
        sc = self.special_casing = {}
        with open_data(SPECIAL_CASING, version) as file:
            for s in file:
                s = s[:-1].split('#', 1)[0]
                if not s:
                    continue
                data = s.split("; ")
                if data[4]:
                    # We ignore all conditionals (since they depend on
                    # languages) except for one, which is hardcoded. See
                    # handle_capital_sigma in unicodeobject.c.
                    continue
                c = int(data[0], 16)
                lower = [int(char, 16) for char in data[1].split()]
                title = [int(char, 16) for char in data[2].split()]
                upper = [int(char, 16) for char in data[3].split()]
                sc[c] = (lower, title, upper)
        # stage 7: case folding ("C"ommon and "F"ull entries only)
        cf = self.case_folding = {}
        if version != '3.2.0':
            with open_data(CASE_FOLDING, version) as file:
                for s in file:
                    s = s[:-1].split('#', 1)[0]
                    if not s:
                        continue
                    data = s.split("; ")
                    if data[1] in "CF":
                        c = int(data[0], 16)
                        cf[c] = [int(char, 16) for char in data[2].split()]

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = list(range(256))
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001152
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001153# hash table tools
1154
1155# this is a straight-forward reimplementation of Python's built-in
1156# dictionary type, using a static data structure, and a custom string
1157# hash algorithm.
1158
def myhash(s, magic):
    """Case-insensitive string hash, folded into 24 bits.

    Multiplies the running hash by *magic* for each character of the
    upper-cased string; whenever the hash grows past 24 bits, the top
    byte is folded back in by xor and masked off.
    """
    h = 0
    for ch in s.upper():
        h = h * magic + ord(ch)
        top = h & 0xff000000
        if top:
            h = (h ^ ((top >> 24) & 0xff)) & 0x00ffffff
    return h
1167
# Candidate hash-table configurations as (size, poly) pairs: size is a
# power-of-two table size, and size+poly is the value Hash.__init__ xors
# into the probe increment when it overflows the table mask.
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
1174
class Hash:
    """Static open-addressing hash table, dumped as C arrays.

    Reimplements the probing scheme of Python's built-in dictionary with
    a fixed table size and the custom myhash() string hash, so the
    generated C code can perform name->code lookups statically.
    """

    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size: first power of two larger than the data set
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynomials")

        print(size, "slots in hash table")

        table = [None] * size

        mask = size-1

        n = 0  # collision counter

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            # slot taken: derive a probe increment from the hash
            incr = (h ^ (h >> 3)) & mask;
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                # vary the increment; fold with poly when it overflows
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print(n, "collisions")
        self.collisions = n

        # empty slots are encoded as 0 in the C array
        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array, plus the constants the C
        # lookup code needs to replay the same probe sequence
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
1238
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001239# stuff to deal with arrays of unsigned integers
1240
class Array:
    """A named sequence of unsigned integers, writable as a C array."""

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        """Write the data to *file* as a static C array declaration,
        using the smallest unsigned element type that fits."""
        size = getsize(self.data)
        if trace:
            print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)
        if size == 1:
            ctype = "unsigned char"
        elif size == 2:
            ctype = "unsigned short"
        else:
            ctype = "unsigned int"
        file.write("static " + ctype)
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            line = "    "
            for item in self.data:
                piece = str(item) + ", "
                # wrap output lines at 78 columns
                if len(line) + len(piece) > 78:
                    file.write(line + "\n")
                    line = "    " + piece
                else:
                    line = line + piece
            if line.strip():
                file.write(line + "\n")
        file.write("};\n\n")
1272
def getsize(data):
    """Return the smallest C integer size (1, 2 or 4 bytes) able to
    hold every value of the given array of non-negative ints."""
    biggest = max(data)
    if biggest >= 65536:
        return 4
    if biggest >= 256:
        return 2
    return 1
1282
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  The table is cut into bins of 2**shift
    entries; duplicate bins are stored once in t2, while t1 holds, for
    each bin, where its contents start in t2.  The shift minimizing the
    combined C-array size of t1 and t2 is chosen, and for each i in
    range(len(t)):

        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]

    where mask isolates the low "shift" bits.  If trace is non-zero,
    progress and size information goes to sys.stderr; higher values
    print more.
    """
    if trace:
        def dump(t1, t2, shift, bytes):
            print("%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes), file=sys.stderr)
        print("Size of original table:", len(t)*getsize(t),
              "bytes", file=sys.stderr)
    # the largest shift that still leaves more than one entry per bin
    maxshift = max((len(t) - 1).bit_length() - 1, 0)
    best_bytes = sys.maxsize       # smallest combined size seen so far
    t = tuple(t)                   # hashable, so bins can be dict keys
    for shift in range(maxshift + 1):
        binsize = 2**shift
        t1 = []
        t2 = []
        bincache = {}
        for start in range(0, len(t), binsize):
            bin = t[start:start+binsize]
            index = bincache.get(bin)
            if index is None:
                # first occurrence of this bin: append its contents
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # combined memory footprint of both C arrays
        nbytes = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, nbytes)
        if nbytes < best_bytes:
            best = t1, t2, shift
            best_bytes = nbytes
    t1, t2, shift = best
    if trace:
        print("Best:", end=' ', file=sys.stderr)
        dump(t1, t2, shift, best_bytes)
    if __debug__:
        # exhaustively verify that the decomposition round-trips
        mask = (1 << shift) - 1
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001343
if __name__ == "__main__":
    # regenerate all database headers; maketables is defined earlier in
    # the file -- the argument 1 presumably enables trace output (cf. the
    # trace parameters above); confirm against maketables' signature
    maketables(1)