Blame - Tools/unicode/makeunicodedata.py - platform/external/python/cpython3

2000-09-24 23:18:31 +0000

[diff] [blame]

1

#

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

2

# (re)generate unicode property and type databases

3

#

Martin v. Löwis

b5c980b

2002-11-25 09:13:37 +0000

[diff] [blame]

4

# this script converts Unicode database files to

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

5

# Modules/unicodedata_db.h, Modules/unicodename_db.h,

6

# and Objects/unicodetype_db.h

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

7

#

8

# history:

9

# 2000-09-24 fl created (based on bits and pieces from unidb)

10

# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

11

# 2000-09-25 fl added character type table

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

12

# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

13

# 2000-11-03 fl expand first/last ranges

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

14

# 2001-01-19 fl added character name tables (2.1)

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

15

# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold

Martin v. Löwis

2002-11-23 22:08:15 +0000

[diff] [blame]

16

# 2002-09-11 wd use string methods

17

# 2002-10-18 mvl update to Unicode 3.2

18

# 2002-10-22 mvl generate NFC tables

Martin v. Löwis

2002-11-24 23:05:09 +0000

[diff] [blame]

19

# 2002-11-24 mvl expand all ranges, sort names version-independently

Martin v. Löwis

b5c980b

2002-11-25 09:13:37 +0000

[diff] [blame]

20

# 2002-11-25 mvl add UNIDATA_VERSION

Hye-Shik Chang

974ed7c

2004-06-02 16:49:17 +0000

[diff] [blame]

21

# 2004-05-29 perky add east asian width information

Martin v. Löwis

43179c8

2006-03-11 12:43:44 +0000

[diff] [blame]

22

# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta

Georg Brandl

d52429f

2008-07-04 15:55:02 +0000

[diff] [blame]

23

# 2008-06-11 gb add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch

Ezio Melotti

2011-10-21 21:57:36 +0300

[diff] [blame]

24

# 2011-10-21 ezio add support for name aliases and named sequences

Benjamin Peterson

2012-01-11 18:17:06 -0500

[diff] [blame]

25

# 2012-01 benjamin add full case mappings

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

26

#

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

27

# written by Fredrik Lundh (fredrik@pythonware.com)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

28

#

29

Ezio Melotti

2011-10-21 21:57:36 +0300

[diff] [blame]

import os

import sys

import zipfile

from textwrap import dedent

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

35

36

SCRIPT = sys.argv[0]
VERSION = "3.2"

# The Unicode Database
# --------------------
# When changing UCD version please update
#   * Doc/library/stdtypes.rst, and
#   * Doc/library/unicodedata.rst
UNIDATA_VERSION = "6.3.0"

# File name templates.  maketables() formats UNICODE_DATA with "" for the
# current version and with "-" + version for an old one.
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt"
CASE_FOLDING = "CaseFolding%s.txt"

# Private Use Areas -- in planes 0 (BMP), 15 and 16
PUA_1 = range(0xE000, 0xF900)
PUA_15 = range(0xF0000, 0xFFFFE)
PUA_16 = range(0x100000, 0x10FFFE)

# we use these ranges of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0200

old_versions = ["3.2.0"]

# Order matters: makeunicodedata() stores list positions in the records it
# generates and emits the names in this order.  "Cn" is listed twice;
# list.index() always resolves it to position 0.
CATEGORY_NAMES = [
    "Cn",
    "Lu", "Ll", "Lt",
    "Mn", "Mc", "Me",
    "Nd", "Nl", "No",
    "Zs", "Zl", "Zp",
    "Cc", "Cf", "Cs", "Co", "Cn",
    "Lm", "Lo",
    "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po",
    "Sm", "Sc", "Sk", "So",
]

BIDIRECTIONAL_NAMES = [
    "",
    "L", "LRE", "LRO",
    "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN",
    "B", "S", "WS", "ON",
    "LRI", "RLI", "FSI", "PDI",
]

EASTASIANWIDTH_NAMES = ["F", "H", "W", "Na", "A", "N"]

MANDATORY_LINE_BREAKS = ["BK", "CR", "LF", "NL"]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 1 << 0            # 0x01
DECIMAL_MASK = 1 << 1          # 0x02
DIGIT_MASK = 1 << 2            # 0x04
LOWER_MASK = 1 << 3            # 0x08
LINEBREAK_MASK = 1 << 4        # 0x10
SPACE_MASK = 1 << 5            # 0x20
TITLE_MASK = 1 << 6            # 0x40
UPPER_MASK = 1 << 7            # 0x80
XID_START_MASK = 1 << 8        # 0x100
XID_CONTINUE_MASK = 1 << 9     # 0x200
PRINTABLE_MASK = 1 << 10       # 0x400
NUMERIC_MASK = 1 << 11         # 0x800
CASE_IGNORABLE_MASK = 1 << 12  # 0x1000
CASED_MASK = 1 << 13           # 0x2000
EXTENDED_CASE_MASK = 1 << 14   # 0x4000

# these ranges need to match unicodedata.c:is_unified_ideograph
# (first, last) code points written as hex strings
cjk_ranges = [
    ('3400', '4DB5'),
    ('4E00', '9FCC'),
    ('20000', '2A6D6'),
    ('2A700', '2B734'),
    ('2B740', '2B81D'),
]

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

107

def maketables(trace=0):
    """Load the Unicode databases and regenerate the three header files.

    Reads the current database (UNIDATA_VERSION), merges every version
    listed in old_versions into it, then runs the name, data and type
    generators.  *trace* is passed through to each generator.
    """

    def populated(db):
        # number of entries in db.table that actually hold a record
        return sum(1 for record in db.table if record)

    print("--- Reading", UNICODE_DATA % "", "...")

    unicode = UnicodeData(UNIDATA_VERSION)

    print(populated(unicode), "characters")

    for old_version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-"+old_version), "...")
        old_unicode = UnicodeData(old_version, cjk_check=False)
        print(populated(old_unicode), "characters")
        merge_old_version(old_version, unicode, old_unicode)

    for generate in (makeunicodename, makeunicodedata, makeunicodetype):
        generate(unicode, trace)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

125

126

# --------------------------------------------------------------------

127

# unicode character properties

128

129

def makeunicodedata(unicode, trace):
    """Generate Modules/unicodedata_db.h from a loaded Unicode database.

    unicode -- database object; this function reads its ``chars``,
               ``table``, ``exclusions`` and ``changed`` attributes.
    trace   -- passed through unchanged to splitbins() and Array.dump().

    Raises Exception when a character's decomposition field splits into
    more than 19 items.

    NOTE(review): leading whitespace inside the emitted C string literals
    is reproduced as it appears in the reviewed copy of this file; confirm
    it against the checked-in generator before regenerating headers.
    """

    FILE = "Modules/unicodedata_db.h"

    def write_names(fp, declaration, names):
        # emit one NULL-terminated C array of string literals
        print(declaration, file=fp)
        for name in names:
            print(" \"%s\"," % name, file=fp)
        print(" NULL", file=fp)
        print("};", file=fp)

    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    # Maps a property tuple to its position in `table`.  Seeded with the
    # dummy record so that an identical record reuses slot 0 instead of
    # being appended a second time.
    cache = {dummy: 0}
    index = [0] * len(unicode.chars)

    print("--- Preparing", FILE, "...")

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if not record:
            continue
        # extract database properties
        item = (
            CATEGORY_NAMES.index(record[2]),         # category
            int(record[3]),                          # combining
            BIDIRECTIONAL_NAMES.index(record[4]),    # bidirectional
            record[9] == "Y",                        # mirrored
            EASTASIANWIDTH_NAMES.index(record[15]),  # east asian width
            record[17],                              # normalization quick check
            )
        # add entry to index and item tables
        i = cache.get(item)
        if i is None:
            cache[item] = i = len(table)
            table.append(item)
        index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    # Maps an encoded decomposition (as a tuple) to its offset in
    # decomp_data.  decomp_data is a flat list of ints, so searching it
    # for a whole entry with list.index() can never match.
    decomp_cache = {}
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if not record or not record[5]:
            # no decomposition: decomp_index[char] keeps its initial 0
            continue
        decomp = record[5].split()
        if len(decomp) > 19:
            raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
        # prefix
        if decomp[0][0] == "<":
            prefix = decomp.pop(0)
        else:
            prefix = ""
        try:
            i = decomp_prefix.index(prefix)
        except ValueError:
            i = len(decomp_prefix)
            decomp_prefix.append(prefix)
        prefix = i
        assert prefix < 256
        # content: a header word (prefix index in the low 8 bits, length
        # above them) followed by the code points
        decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
        # Collect NFC pairs: unprefixed two-character decompositions that
        # are not excluded and whose first character has combining "0"
        if not prefix and len(decomp) == 3 and \
           char not in unicode.exclusions and \
           unicode.table[decomp[1]][3] == "0":
            p, l, r = decomp
            comp_first[l] = 1
            comp_last[r] = 1
            comp_pairs.append((l,r,char))
        key = tuple(decomp)
        i = decomp_cache.get(key)
        if i is None:
            i = decomp_cache[key] = len(decomp_data)
            decomp_data.extend(decomp)
            decomp_size = decomp_size + len(decomp) * 2
        decomp_index[char] = i

    # Renumber the first/last characters of NFC pairs densely and record
    # the runs of consecutive code points they form.
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    # flush the final open range, if any character was seen at all
    if prev_f is not None:
        comp_first_ranges.append(prev_f)
    if prev_l is not None:
        comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    # total_first x total_last matrix, row-major, holding the composed
    # character for each (first, last) pair and 0 elsewhere
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    # the with-statement closes the file even if a later step raises
    with open(FILE, "w") as fp:
        print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
        print(file=fp)
        print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
        print("/* a list of unique database records */", file=fp)
        print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
        for item in table:
            print(" {%d, %d, %d, %d, %d, %d}," % item, file=fp)
        print("};", file=fp)
        print(file=fp)

        print("/* Reindexing of NFC first characters. */", file=fp)
        print("#define TOTAL_FIRST",total_first, file=fp)
        print("#define TOTAL_LAST",total_last, file=fp)
        print("struct reindex{int start;short count,index;};", file=fp)
        print("static struct reindex nfc_first[] = {", file=fp)
        for start,end in comp_first_ranges:
            print(" { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)
        print(" {0,0,0}", file=fp)
        print("};\n", file=fp)
        print("static struct reindex nfc_last[] = {", file=fp)
        for start,end in comp_last_ranges:
            print(" { %d, %d, %d}," % (start,end-start,comp_last[start]), file=fp)
        print(" {0,0,0}", file=fp)
        print("};\n", file=fp)

        # FIXME: <fl> the following tables could be made static, and
        # the support code moved into unicodedatabase.c

        print("/* string literals */", file=fp)
        write_names(fp, "const char *_PyUnicode_CategoryNames[] = {",
                    CATEGORY_NAMES)
        write_names(fp, "const char *_PyUnicode_BidirectionalNames[] = {",
                    BIDIRECTIONAL_NAMES)
        write_names(fp, "const char *_PyUnicode_EastAsianWidthNames[] = {",
                    EASTASIANWIDTH_NAMES)
        write_names(fp, "static const char *decomp_prefix[] = {",
                    decomp_prefix)

        # split record index table
        index1, index2, shift = splitbins(index, trace)

        print("/* index tables for the database records */", file=fp)
        print("#define SHIFT", shift, file=fp)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # split decomposition index table
        index1, index2, shift = splitbins(decomp_index, trace)

        print("/* decomposition data */", file=fp)
        Array("decomp_data", decomp_data).dump(fp, trace)

        print("/* index tables for the decomposition data */", file=fp)
        print("#define DECOMP_SHIFT", shift, file=fp)
        Array("decomp_index1", index1).dump(fp, trace)
        Array("decomp_index2", index2).dump(fp, trace)

        # split NFC pair matrix
        comp_index1, comp_index2, shift = splitbins(comp_data, trace)
        print("/* NFC pairs */", file=fp)
        print("#define COMP_SHIFT", shift, file=fp)
        Array("comp_index", comp_index1).dump(fp, trace)
        Array("comp_data", comp_index2).dump(fp, trace)

        # Generate delta tables for old versions
        for version, changes, normalization in unicode.changed:
            cversion = version.replace(".","_")
            records = [changes[0]]
            change_cache = {changes[0]: 0}
            change_index = [0] * len(changes)
            for i, record in enumerate(changes):
                pos = change_cache.get(record)
                if pos is None:
                    pos = change_cache[record] = len(records)
                    records.append(record)
                change_index[i] = pos
            index1, index2, shift = splitbins(change_index, trace)
            print("static const change_record change_records_%s[] = {" % cversion, file=fp)
            for record in records:
                print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
            print("};", file=fp)
            Array("changes_%s_index" % cversion, index1).dump(fp, trace)
            Array("changes_%s_data" % cversion, index2).dump(fp, trace)
            print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
            print("{", file=fp)
            print("\tint index;", file=fp)
            print("\tif (n >= 0x110000) index = 0;", file=fp)
            print("\telse {", file=fp)
            print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
            print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
                  (cversion, shift, ((1<<shift)-1)), file=fp)
            print("\t}", file=fp)
            print("\treturn change_records_%s+index;" % cversion, file=fp)
            print("}\n", file=fp)
            print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
            print("{", file=fp)
            print("\tswitch(n) {", file=fp)
            for k, v in normalization:
                print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
            print("\tdefault: return 0;", file=fp)
            print("\t}\n}\n", file=fp)

377

378

# --------------------------------------------------------------------

379

# unicode character type tables

380

381

def makeunicodetype(unicode, trace):
    """Generate Objects/unicodetype_db.h from a loaded Unicode database.

    unicode -- database object; this function reads its ``chars``,
               ``table``, ``special_casing`` and ``case_folding``
               attributes.
    trace   -- passed through unchanged to splitbins() and Array.dump().

    NOTE(review): leading whitespace inside the emitted C string literals
    is reproduced as it appears in the reviewed copy of this file; confirm
    it against the checked-in generator before regenerating headers.
    """

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    # Maps a type tuple to its position in `table`.  Seeded with the dummy
    # record so that characters whose tuple is all zeros share slot 0
    # instead of adding a second identical record.
    cache = {dummy: 0}
    index = [0] * len(unicode.chars)
    numeric = {}
    spaces = []
    linebreaks = []
    extra_casing = []

    for char in unicode.chars:
        record = unicode.table[char]
        if not record:
            continue
        # extract database properties
        category = record[2]
        bidirectional = record[4]
        properties = record[16]
        flags = 0
        if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
            flags |= ALPHA_MASK
        if "Lowercase" in properties:
            flags |= LOWER_MASK
        if 'Line_Break' in properties or bidirectional == "B":
            flags |= LINEBREAK_MASK
            linebreaks.append(char)
        if category == "Zs" or bidirectional in ("WS", "B", "S"):
            flags |= SPACE_MASK
            spaces.append(char)
        if category == "Lt":
            flags |= TITLE_MASK
        if "Uppercase" in properties:
            flags |= UPPER_MASK
        if char == ord(" ") or category[0] not in ("C", "Z"):
            flags |= PRINTABLE_MASK
        if "XID_Start" in properties:
            flags |= XID_START_MASK
        if "XID_Continue" in properties:
            flags |= XID_CONTINUE_MASK
        if "Cased" in properties:
            flags |= CASED_MASK
        if "Case_Ignorable" in properties:
            flags |= CASE_IGNORABLE_MASK
        sc = unicode.special_casing.get(char)
        cf = unicode.case_folding.get(char, [char])
        # simple case mappings; a missing field means "maps to itself"
        # (title falls back to the upper mapping)
        if record[12]:
            upper = int(record[12], 16)
        else:
            upper = char
        if record[13]:
            lower = int(record[13], 16)
        else:
            lower = char
        if record[14]:
            title = int(record[14], 16)
        else:
            title = upper
        if sc is None and cf != [lower]:
            sc = ([lower], [title], [upper])
        if sc is None:
            # simple case: store the mappings as deltas from char
            if upper == lower == title:
                upper = lower = title = 0
            else:
                upper = upper - char
                lower = lower - char
                title = title - char
                assert (abs(upper) <= 2147483647 and
                        abs(lower) <= 2147483647 and
                        abs(title) <= 2147483647)
        else:
            # This happens either when some character maps to more than one
            # character in uppercase, lowercase, or titlecase or the
            # casefolded version of the character is different from the
            # lowercase. The extra characters are stored in a different
            # array.
            # Each field packs an offset into extra_casing with a length
            # in bits 24 and up; `lower` also carries the case-fold length
            # in bits 20 and up when the fold differs from the lowercase.
            flags |= EXTENDED_CASE_MASK
            lower = len(extra_casing) | (len(sc[0]) << 24)
            extra_casing.extend(sc[0])
            if cf != sc[0]:
                lower |= len(cf) << 20
                extra_casing.extend(cf)
            upper = len(extra_casing) | (len(sc[2]) << 24)
            extra_casing.extend(sc[2])
            # Title is probably equal to upper.
            if sc[1] == sc[2]:
                title = upper
            else:
                title = len(extra_casing) | (len(sc[1]) << 24)
                extra_casing.extend(sc[1])
        # decimal digit, integer digit
        decimal = 0
        if record[6]:
            flags |= DECIMAL_MASK
            decimal = int(record[6])
        digit = 0
        if record[7]:
            flags |= DIGIT_MASK
            digit = int(record[7])
        if record[8]:
            flags |= NUMERIC_MASK
            numeric.setdefault(record[8], []).append(char)
        item = (
            upper, lower, title, decimal, digit, flags
            )
        # add entry to index and item tables
        i = cache.get(item)
        if i is None:
            cache[item] = i = len(table)
            table.append(item)
        index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")
    print(len(extra_casing), "extended case array")

    print("--- Writing", FILE, "...")

    # the with-statement closes the file even if a later step raises
    with open(FILE, "w") as fp:
        print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
        print(file=fp)
        print("/* a list of unique character type descriptors */", file=fp)
        print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
        for item in table:
            print(" {%d, %d, %d, %d, %d, %d}," % item, file=fp)
        print("};", file=fp)
        print(file=fp)

        print("/* extended case mappings */", file=fp)
        print(file=fp)
        print("const Py_UCS4 _PyUnicode_ExtendedCase[] = {", file=fp)
        for c in extra_casing:
            print(" %d," % c, file=fp)
        print("};", file=fp)
        print(file=fp)

        # split type index table
        index1, index2, shift = splitbins(index, trace)

        print("/* type indexes */", file=fp)
        print("#define SHIFT", shift, file=fp)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # Generate code for _PyUnicode_ToNumeric()
        numeric_items = sorted(numeric.items())
        print('/* Returns the numeric value as double for Unicode characters', file=fp)
        print(' * having this property, -1.0 otherwise.', file=fp)
        print(' */', file=fp)
        print('double _PyUnicode_ToNumeric(Py_UCS4 ch)', file=fp)
        print('{', file=fp)
        print(' switch (ch) {', file=fp)
        for value, codepoints in numeric_items:
            # Turn text into float literals
            parts = value.split('/')
            parts = [repr(float(part)) for part in parts]
            value = '/'.join(parts)

            # all code points sharing a value fall through to one return
            codepoints.sort()
            for codepoint in codepoints:
                print(' case 0x%04X:' % (codepoint,), file=fp)
            print(' return (double) %s;' % (value,), file=fp)
        print(' }', file=fp)
        print(' return -1.0;', file=fp)
        print('}', file=fp)
        print(file=fp)

        # Generate code for _PyUnicode_IsWhitespace()
        print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)
        print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)
        print(" */", file=fp)
        print('int _PyUnicode_IsWhitespace(const Py_UCS4 ch)', file=fp)
        print('{', file=fp)
        print(' switch (ch) {', file=fp)

        for codepoint in sorted(spaces):
            print(' case 0x%04X:' % (codepoint,), file=fp)
        print(' return 1;', file=fp)

        print(' }', file=fp)
        print(' return 0;', file=fp)
        print('}', file=fp)
        print(file=fp)

        # Generate code for _PyUnicode_IsLinebreak()
        print("/* Returns 1 for Unicode characters having the line break", file=fp)
        print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp)
        print(" * type 'B', 0 otherwise.", file=fp)
        print(" */", file=fp)
        print('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)', file=fp)
        print('{', file=fp)
        print(' switch (ch) {', file=fp)
        for codepoint in sorted(linebreaks):
            print(' case 0x%04X:' % (codepoint,), file=fp)
        print(' return 1;', file=fp)

        print(' }', file=fp)
        print(' return 0;', file=fp)
        print('}', file=fp)
        print(file=fp)

590

591

# --------------------------------------------------------------------

592

# unicode name database

593

594

def makeunicodename(unicode, trace):
    """Generate Modules/unicodename_db.h from the names in *unicode*.

    Character names are split into words.  The words are stored once in
    a lexicon, and every name becomes a run of word indexes in a
    phrasebook.  A static name->code hash table is built with Hash, and
    the alias and named-sequence tables are appended to the header.

    unicode -- database object; its chars, table, aliases and
               named_sequences attributes are read
    trace   -- passed through to Array.dump(), Hash.dump() and
               splitbins()
    """

    FILE = "Modules/unicodename_db.h"

    print("--- Preparing", FILE, "...")

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                # the trailing NUL marks the end of the name
                names[char] = name + chr(0)

    print(sum(1 for n in names if n is not None), "distinct names")

    # collect unique words from names (note that we differ between
    # words inside a sentence, and words ending a sentence.  the
    # latter includes the trailing null byte.)

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            parts = name.split()
            b = b + len(name)
            n = n + len(parts)
            for w in parts:
                uses = words.get(w)
                if uses:
                    # only the length of the list (use count) matters
                    uses.append(None)
                else:
                    words[w] = [len(words)]

    print(n, "words in text;", b, "bytes")

    wordlist = list(words.items())

    # sort on falling frequency, then by name
    def word_key(a):
        aword, alist = a
        return -len(alist), aword
    wordlist.sort(key=word_key)

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print(escapes, "escapes")

    short = 256 - escapes

    assert short > 0

    print(short, "short indexes in lexicon")

    # statistics (slicing keeps this safe when there are fewer than
    # `short` distinct words, e.g. for a restricted character range)
    n = sum(len(uses) for word, uses in wordlist[:short])
    print(n, "short indexes in phrasebook")

    # pick the most commonly used words, and sort the rest in
    # descending order of the word text.  NOTE: the sort key is the
    # word itself, not its length.

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = list(map(ord, lexicon))

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            parts = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in parts:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set.  if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print("--- Writing", FILE, "...")

    # the with block closes the header even if a dump step raises
    with open(FILE, "w") as fp:
        print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
        print(file=fp)
        print("#define NAME_MAXLEN", 256, file=fp)
        print(file=fp)
        print("/* lexicon */", file=fp)
        Array("lexicon", lexicon).dump(fp, trace)
        Array("lexicon_offset", lexicon_offset).dump(fp, trace)

        # split decomposition index table
        offset1, offset2, shift = splitbins(phrasebook_offset, trace)

        print("/* code->name phrasebook */", file=fp)
        print("#define phrasebook_shift", shift, file=fp)
        print("#define phrasebook_short", short, file=fp)

        Array("phrasebook", phrasebook).dump(fp, trace)
        Array("phrasebook_offset1", offset1).dump(fp, trace)
        Array("phrasebook_offset2", offset2).dump(fp, trace)

        print("/* name->code dictionary */", file=fp)
        codehash.dump(fp, trace)

        print(file=fp)
        print('static const unsigned int aliases_start = %#x;' %
              NAME_ALIASES_START, file=fp)
        print('static const unsigned int aliases_end = %#x;' %
              (NAME_ALIASES_START + len(unicode.aliases)), file=fp)

        print('static const unsigned int name_aliases[] = {', file=fp)
        for name, codepoint in unicode.aliases:
            print(' 0x%04X,' % codepoint, file=fp)
        print('};', file=fp)

        # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
        # so we are using Py_UCS2 seq[4].  This needs to be updated if longer
        # sequences or sequences with non-BMP chars are added.
        # unicodedata_lookup should be adapted too.
        print(dedent("""
            typedef struct NamedSequence {
            int seqlen;
            Py_UCS2 seq[4];
            } named_sequence;
            """), file=fp)

        print('static const unsigned int named_sequences_start = %#x;' %
              NAMED_SEQUENCES_START, file=fp)
        print('static const unsigned int named_sequences_end = %#x;' %
              (NAMED_SEQUENCES_START + len(unicode.named_sequences)), file=fp)

        print('static const named_sequence named_sequences[] = {', file=fp)
        for name, sequence in unicode.named_sequences:
            seq_str = ', '.join('0x%04X' % cp for cp in sequence)
            print(' {%d, {%s}},' % (len(sequence), seq_str), file=fp)
        print('};', file=fp)

785

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

786

787

def merge_old_version(version, new, old):
    """Record on *new* how the older database *old* differs from it.

    One tuple is appended to new.changed:
    (version, per-code-point change records, normalization changes).
    Each per-code-point record is
    (bidir, category, decimal, mirrored, numeric).

    Raises NotImplementedError if the exclusion sets differ, and a local
    Difference exception for a changed field that has no handling here.
    """
    class Difference(Exception):
        pass

    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    codespace = 0x110000

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF] * codespace
    category_changes = [0xFF] * codespace
    decimal_changes = [0xFF] * codespace
    mirrored_changes = [0xFF] * codespace
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * codespace
    # normalization_changes is a list of key-value pairs
    normalization_changes = []

    # Fields whose differences are deliberately not recorded:
    #   11  ISO comment
    #   12  simple uppercase mapping
    #   13  simple lowercase mapping
    #   14  simple titlecase mapping
    #   16  derived property changes; not yet
    #   17  normalization quickchecks are not performed for older versions
    ignored_fields = (11, 12, 13, 14, 16, 17)

    for i in range(codespace):
        new_record = new.table[i]
        if new_record is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue

        old_record = old.table[i]
        if old_record is None:
            # unassigned in the old version; category 0 is "unassigned"
            category_changes[i] = 0
            continue

        if old_record == new_record:
            continue

        # walk the fields of the two records and note those that differ
        for k, value in enumerate(old_record):
            if value == new_record[k]:
                continue
            if k == 1 and i in PUA_15:
                # the name is not set in the old.table, but in the
                # new.table we are using it for aliases and named seq
                assert value == ''
            elif k == 2:
                category_changes[i] = CATEGORY_NAMES.index(value)
            elif k == 4:
                bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
            elif k == 5:
                # We assume that all normalization changes are in 1:1 mappings
                assert " " not in value
                normalization_changes.append((i, value))
            elif k == 6:
                # we only support changes where the old value is a single digit
                assert value in "0123456789"
                decimal_changes[i] = int(value)
            elif k == 8:
                # Since 0 encodes "no change", the old value is better not 0
                if value:
                    numeric_changes[i] = float(value)
                    assert numeric_changes[i] not in (0, -1)
                else:
                    numeric_changes[i] = -1
            elif k == 9:
                mirrored_changes[i] = '1' if value == 'Y' else '0'
            elif k in ignored_fields:
                pass
            else:
                raise Difference(hex(i), k, old.table[i], new.table[i])

    per_char_changes = list(zip(bidir_changes, category_changes,
                                decimal_changes, mirrored_changes,
                                numeric_changes))
    new.changed.append((version, per_char_changes, normalization_changes))

Tim Peters

88ca467

2006-03-10 23:39:56 +0000

[diff] [blame]

877

Martin v. Löwis

2010-10-11 22:42:28 +0000

[diff] [blame]

878

def open_data(template, version):
    """Open the data file named by *template* for *version*.

    The local file name is the template filled in with '-' + version.
    If that file does not exist it is first downloaded from
    www.unicode.org.  Names ending in '.txt' are opened as UTF-8 text;
    anything else (e.g. Unihan.zip) is opened in binary mode.
    """
    local = template % ('-' + version,)

    if not os.path.exists(local):
        import urllib.request
        if version != '3.2.0':
            url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
        else:
            # irregular url structure
            url = 'http://www.unicode.org/Public/3.2-Update/' + local
        urllib.request.urlretrieve(url, filename=local)

    if not local.endswith('.txt'):
        # Unihan.zip
        return open(local, 'rb')
    return open(local, encoding='utf-8')

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

893

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

894

# --------------------------------------------------------------------

895

# the following support code is taken from the unidb utilities

896

897

898

# load a unicode-data file from disk

899

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

900

class UnicodeData:
    """In-memory copy of one version of the Unicode character database.

    Attributes set by __init__:
      changed         -- list, initially empty
      filename        -- UNICODE_DATA template with an empty version part
      table           -- list of 0x110000 entries, a record (list) per
                         assigned code point and None otherwise
      chars           -- list of the code points to process
      aliases         -- list of (name, code point); not set for '3.2.0'
      named_sequences -- list of (name, tuple of code points); not set
                         for '3.2.0'
      exclusions      -- dict mapping excluded code points to 1
      special_casing  -- dict: code point -> (lower, title, upper) lists
      case_folding    -- dict: code point -> list of folded code points
    """
    # Record structure:
    # [ID, name, category, combining, bidi, decomp,  (6)
    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
    #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
    #  derived-props] (17)

    def __init__(self, version,
                 linebreakprops=False,
                 expand=1,
                 cjk_check=True):
        """Load every data file for *version* through open_data().

        version        -- version string, e.g. '3.2.0'
        linebreakprops -- accepted but not referenced in this method
        expand         -- if true, fill in the code points between
                          "First>" and "Last>" records
        cjk_check      -- if true (and expand is true), raise ValueError
                          when the CJK ranges found differ from the
                          module-level cjk_ranges
        """
        self.changed = []
        table = [None] * 0x110000
        with open_data(UNICODE_DATA, version) as file:
            for s in file:
                s = s.strip().split(";")
                char = int(s[0], 16)
                table[char] = s

        cjk_ranges_found = []

        # expand first-last ranges
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        if s[1].startswith("<CJK Ideograph"):
                            cjk_ranges_found.append((field[0],
                                                     s[0]))
                        s[1] = ""
                        field = None
                elif field:
                    # inside a range: copy the "First>" record under
                    # this code point's own ID
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2
            if cjk_check and cjk_ranges != cjk_ranges_found:
                raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)

        # public attributes
        self.filename = UNICODE_DATA % ''
        self.table = table
        self.chars = list(range(0x110000)) # unicode 3.2

        # check for name aliases and named sequences, see #12753
        # aliases and named sequences are not in 3.2.0
        if version != '3.2.0':
            self.aliases = []
            # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters
            pua_index = NAME_ALIASES_START
            with open_data(NAME_ALIASES, version) as file:
                for s in file:
                    s = s.strip()
                    if not s or s.startswith('#'):
                        continue
                    char, name, abbrev = s.split(';')
                    char = int(char, 16)
                    self.aliases.append((name, char))
                    # also store the name in the PUA 1
                    self.table[pua_index][1] = name
                    pua_index += 1
            assert pua_index - NAME_ALIASES_START == len(self.aliases)

            self.named_sequences = []
            # store named sequences in the PUA 1, in range U+F0100..,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters.

            assert pua_index < NAMED_SEQUENCES_START
            pua_index = NAMED_SEQUENCES_START
            with open_data(NAMED_SEQUENCES, version) as file:
                for s in file:
                    s = s.strip()
                    if not s or s.startswith('#'):
                        continue
                    name, chars = s.split(';')
                    chars = tuple(int(char, 16) for char in chars.split())
                    # check that the structure defined in makeunicodename is OK
                    assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
                    assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
                        "the NamedSequence struct and in unicodedata_lookup")
                    self.named_sequences.append((name, chars))
                    # also store these in the PUA 1
                    self.table[pua_index][1] = name
                    pua_index += 1
            assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)

        self.exclusions = {}
        with open_data(COMPOSITION_EXCLUSIONS, version) as file:
            for s in file:
                s = s.strip()
                if not s:
                    continue
                if s[0] == '#':
                    continue
                char = int(s.split()[0],16)
                self.exclusions[char] = 1

        widths = [None] * 0x110000
        with open_data(EASTASIAN_WIDTH, version) as file:
            for s in file:
                s = s.strip()
                if not s:
                    continue
                if s[0] == '#':
                    continue
                s = s.split()[0].split(';')
                if '..' in s[0]:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                    chars = list(range(first, last+1))
                else:
                    chars = [int(s[0], 16)]
                for char in chars:
                    widths[char] = s[1]

        # record field 15: east asian width
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

        # record field 16: set of derived property names
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(set())

        with open_data(DERIVED_CORE_PROPERTIES, version) as file:
            for s in file:
                s = s.split('#', 1)[0].strip()
                if not s:
                    continue

                r, p = s.split(";")
                r = r.strip()
                p = p.strip()
                if ".." in r:
                    first, last = [int(c, 16) for c in r.split('..')]
                    chars = list(range(first, last+1))
                else:
                    chars = [int(r, 16)]
                for char in chars:
                    if table[char]:
                        # Some properties (e.g. Default_Ignorable_Code_Point)
                        # apply to unassigned code points; ignore them
                        table[char][-1].add(p)

        with open_data(LINE_BREAK, version) as file:
            for s in file:
                s = s.partition('#')[0]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
                    continue
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    table[char][-1].add('Line_Break')

        # We only want the quickcheck properties
        # Format: NF?_QC; Y(es)/N(o)/M(aybe)
        # Yes is the default, hence only N and M occur
        # In 3.2.0, the format was different (NF?_NO)
        # The parsing will incorrectly determine these as
        # "yes", however, unicodedata.c will not perform quickchecks
        # for older versions, and no delta records will be created.
        quickchecks = [0] * 0x110000
        qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
        with open_data(DERIVEDNORMALIZATION_PROPS, version) as file:
            for s in file:
                if '#' in s:
                    s = s[:s.index('#')]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in qc_order:
                    continue
                quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
                # two bits per normalization form, in qc_order
                quickcheck_shift = qc_order.index(s[1])*2
                quickcheck <<= quickcheck_shift
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    assert not (quickchecks[char]>>quickcheck_shift)&3
                    quickchecks[char] |= quickcheck
        # record field 17: packed quickcheck bits
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(quickchecks[i])

        if version == '3.2.0':
            unihan_member = 'Unihan-3.2.0.txt'
        else:
            unihan_member = 'Unihan_NumericValues.txt'
        with open_data(UNIHAN, version) as file:
            # close the archive when done; ZipFile.read() leaves no open
            # member handle behind
            with zipfile.ZipFile(file) as archive:
                data = archive.read(unihan_member)
        for line in data.decode("utf-8").splitlines():
            if not line.startswith('U+'):
                continue
            code, tag, value = line.split(None, 3)[:3]
            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                           'kOtherNumeric'):
                continue
            value = value.strip().replace(',', '')
            i = int(code[2:], 16)
            # Patch the numeric field
            if table[i] is not None:
                table[i][8] = value
        sc = self.special_casing = {}
        with open_data(SPECIAL_CASING, version) as file:
            for s in file:
                # strip only the line terminator, so that a final line
                # without a newline keeps its last character
                s = s.rstrip('\n').split('#', 1)[0]
                if not s:
                    continue
                data = s.split("; ")
                if data[4]:
                    # We ignore all conditionals (since they depend on
                    # languages) except for one, which is hardcoded. See
                    # handle_capital_sigma in unicodeobject.c.
                    continue
                c = int(data[0], 16)
                lower = [int(char, 16) for char in data[1].split()]
                title = [int(char, 16) for char in data[2].split()]
                upper = [int(char, 16) for char in data[3].split()]
                sc[c] = (lower, title, upper)
        cf = self.case_folding = {}
        if version != '3.2.0':
            with open_data(CASE_FOLDING, version) as file:
                for s in file:
                    # see the SPECIAL_CASING loop: drop only the newline
                    s = s.rstrip('\n').split('#', 1)[0]
                    if not s:
                        continue
                    data = s.split("; ")
                    if data[1] in "CF":
                        c = int(data[0], 16)
                        cf[c] = [int(char, 16) for char in data[2].split()]

    def uselatin1(self):
        """Restrict the character range to ISO Latin 1 (0..255)."""
        self.chars = list(range(256))

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

1145

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

1146

# hash table tools

1147

1148

# this is a straight-forward reimplementation of Python's built-in

1149

# dictionary type, using a static data structure, and a custom string

1150

# hash algorithm.

1151

1152

def myhash(s, magic):
    """Return the custom string hash of *s*, ignoring case.

    Each character of the upper-cased string is mixed in as
    h = h * magic + ord(char).  Whenever any of bits 24..31 are set
    afterwards, those bits are XORed into the low byte and the value is
    masked to 24 bits.
    """
    h = 0
    for ch in s.upper():
        h = h * magic + ord(ch)
        high = h & 0xff000000
        if high:
            h = (h ^ ((high >> 24) & 0xff)) & 0x00ffffff
    return h

# (table size, polynomial) pairs used by Hash.  The first size larger
# than the number of entries is chosen; its polynomial, added to the
# size, is XORed into the probe increment when the increment overflows
# the table mask.
SIZES = [
    (4, 3), (8, 3), (16, 3), (32, 5),
    (64, 3), (128, 3), (256, 29), (512, 17),
    (1024, 9), (2048, 5), (4096, 83), (8192, 27),
    (16384, 43), (32768, 3), (65536, 45), (131072, 9),
    (262144, 39), (524288, 39), (1048576, 9), (2097152, 5),
    (4194304, 3), (8388608, 33), (16777216, 27),
]

class Hash:
    """Static hash table built from a list of (key, value) pairs.

    Keys are hashed with myhash(); only the values are stored, and
    empty slots are written out as 0.
    """

    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size: first entry of SIZES that is larger
        # than the number of items
        wanted = len(data)
        for size, poly in SIZES:
            if size > wanted:
                break
        else:
            raise AssertionError("ran out of polynomials")
        poly = size + poly

        print(size, "slots in hash table")

        mask = size - 1
        slots = [None] * size
        collisions = 0

        # initialize hash table
        for key, value in data:
            h = myhash(key, magic)
            pos = (~h) & mask
            if slots[pos] is None:
                slots[pos] = value
                continue
            # slot taken: probe with a changing increment until a free
            # slot turns up
            step = (h ^ (h >> 3)) & mask
            if not step:
                step = mask
            while True:
                collisions += 1
                pos = (pos + step) & mask
                if slots[pos] is None:
                    slots[pos] = value
                    break
                step <<= 1
                if step > mask:
                    step ^= poly

        print(collisions, "collisions")
        self.collisions = collisions

        # unused slots are emitted as 0
        table = [0 if entry is None else entry for entry in slots]

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array, followed by the three
        # #define lines describing the table
        self.data.dump(file, trace)
        prefix = self.name
        file.write("#define %s_magic %d\n" % (prefix, self.magic))
        file.write("#define %s_size %d\n" % (prefix, self.size))
        file.write("#define %s_poly %d\n" % (prefix, self.poly))

1231

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

1232

# stuff to deal with arrays of unsigned integers

class Array:
    """A named sequence of unsigned integers, dumpable as a C array."""

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array; the element type is picked
        # from the width reported by getsize()
        size = getsize(self.data)
        if trace:
            print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)

        ctype = {1: "unsigned char", 2: "unsigned short"}.get(size, "unsigned int")
        file.write("static ")
        file.write(ctype)
        file.write(" " + self.name + "[] = {\n")

        if self.data:
            # pack as many "item, " cells as fit before column 78
            row = " "
            for item in self.data:
                cell = str(item) + ", "
                if len(row) + len(cell) > 78:
                    file.write(row + "\n")
                    row = " " + cell
                else:
                    row = row + cell
            if row.strip():
                file.write(row + "\n")

        file.write("};\n\n")

def getsize(data):
    """Return the smallest integer size, in bytes, for the given array.

    The result is 1 if every item is below 256, 2 if every item is
    below 65536, and 4 otherwise.  An empty sequence needs no storage
    per item and is reported as 1 instead of raising ValueError from
    max().
    """
    if not data:
        return 1
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4

Tim Peters

2000-09-25 07:13:41 +0000

[diff] [blame]

1276

def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    An empty t yields ([], [], 0); the relation above then holds vacuously.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """

    t = tuple(t)    # so slices can be dict keys
    if not t:
        # Nothing to split: skip the size search altogether, since there
        # is no maximum value from which an item size could be derived.
        return [], [], 0

    if trace:
        def dump(t1, t2, shift, nbytes):
            print("%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, nbytes), file=sys.stderr)
        print("Size of original table:", len(t)*getsize(t),
              "bytes", file=sys.stderr)

    n = len(t) - 1  # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n

    best = None
    best_bytes = sys.maxsize    # smallest total size so far
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}   # maps a bin's contents to its start offset in t2
        for i in range(0, len(t), size):
            chunk = t[i:i+size]
            index = bincache.get(chunk)
            if index is None:
                # first time this bin is seen: append it to t2
                index = len(t2)
                bincache[chunk] = index
                t2.extend(chunk)
            # t1 stores the bin number; the lookup shifts it back up
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < best_bytes:
            best = t1, t2, shift
            best_bytes = b

    t1, t2, shift = best
    if trace:
        print("Best:", end=' ', file=sys.stderr)
        dump(t1, t2, shift, best_bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift)     # i.e., low-bit mask of shift bits
        for i, value in enumerate(t):
            assert value == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

1336

1337

if __name__ == "__main__":

Fredrik Lundh