blob: d83cf634954d2a439fa7754a1eb07664ebd01e09 [file] [log] [blame]
#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
# 2008-06-11 gb   add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
# 2011-10-21 ezio add support for name aliases and named sequences
# 2012-01    benjamin add full case mappings
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#

import os
import sys
import zipfile

from textwrap import dedent

# Script identity, embedded in the header comment of each generated file
# ("/* this file was generated by <SCRIPT> <VERSION> */").
SCRIPT = sys.argv[0]
VERSION = "3.2"

# The Unicode Database
# Version of the Unicode Character Database the tables are generated from.
UNIDATA_VERSION = "6.1.0"
# UCD data-file name templates.  The "%s" slot is filled with "" for the
# current UNIDATA_VERSION, or with "-<version>" for an older snapshot
# (see maketables()).
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt"
CASE_FOLDING = "CaseFolding%s.txt"

# Private Use Areas -- in planes 1, 15, 16
PUA_1 = range(0xE000, 0xF900)
PUA_15 = range(0xF0000, 0xFFFFE)
PUA_16 = range(0x100000, 0x10FFFE)

# we use these ranges of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0200

# Older UCD versions for which delta ("change") tables are generated.
old_versions = ["3.2.0"]

# General-category codes; a character's category is stored as an index
# into this list (the generated _PyUnicode_CategoryNames mirrors it).
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

# Bidirectional-class codes, likewise stored by index.
BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

# East Asian width classes, likewise stored by index.
EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

# Line_Break classes that force a mandatory line break.
MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# Bit flags packed into the flags field of _PyUnicode_TypeRecord
# (assembled in makeunicodetype()).
# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NUMERIC_MASK = 0x800
CASE_IGNORABLE_MASK = 0x1000
CASED_MASK = 0x2000
EXTENDED_CASE_MASK = 0x4000

# (first, last) code points -- as hex strings -- of the CJK unified
# ideograph blocks.
# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
    ('3400', '4DB5'),
    ('4E00', '9FCC'),
    ('20000', '2A6D6'),
    ('2A700', '2B734'),
    ('2B740', '2B81D')
]
102
def maketables(trace=0):
    """Top-level driver: parse the UCD, merge in the old-version deltas,
    and regenerate the three database headers (names, properties, types).

    trace is forwarded to the individual generators (and ultimately to
    splitbins()) to enable extra diagnostic output.
    """

    print("--- Reading", UNICODE_DATA % "", "...")

    version = ""
    ucd = UnicodeData(UNIDATA_VERSION)

    # count the assigned (non-empty) entries in the code point table
    print(sum(1 for entry in ucd.table if entry), "characters")

    # fold each supported historical UCD snapshot into the current data,
    # so delta tables can be generated for it later
    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-" + version), "...")
        previous = UnicodeData(version, cjk_check=False)
        print(sum(1 for entry in previous.table if entry), "characters")
        merge_old_version(version, ucd, previous)

    makeunicodename(ucd, trace)
    makeunicodedata(ucd, trace)
    makeunicodetype(ucd, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000121
122# --------------------------------------------------------------------
123# unicode character properties
124
def makeunicodedata(unicode, trace):
    """Generate Modules/unicodedata_db.h from the parsed UCD.

    Emits, in order:
      * _PyUnicode_Database_Records -- deduplicated (category, combining,
        bidirectional, mirrored, east-asian-width, quickcheck) records,
        plus two-level index1/index2 tables mapping code points to them
      * NFC composition tables (nfc_first/nfc_last reindexes + comp_data)
      * the flat decomposition array and its index tables
      * per-old-version change_records / get_change_* / normalization_*
        delta tables (one set per entry in unicode.changed)

    unicode is a UnicodeData instance (see maketables()); trace is passed
    through to splitbins() for statistics output.
    """

    # record 0 is a shared dummy used for all unassigned code points
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties (record fields follow the
            # UnicodeData.txt column layout, extended by the loader)
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            normalizationquickcheck = record[17]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables (deduplicated via cache)
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]          # flat array: [header, char, char, ...] runs
    decomp_prefix = [""]       # compatibility tags, e.g. "<compat>"; "" = canonical
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                if len(decomp) > 19:
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix: an optional leading "<tag>" marks a
                # compatibility (non-canonical) decomposition
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content: header word packs the prefix index (low byte)
                # and the decomposition length (high byte)
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs: canonical (no prefix) two-character
                # decompositions, not composition-excluded, whose first
                # character has combining class 0
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                # NOTE(review): decomp_data is a flat list of ints, so
                # .index(decomp) -- searching for a *list* -- always raises
                # ValueError; the dedupe branch never fires and each
                # decomposition is appended at its own offset.  Output is
                # still correct, just unshared -- TODO confirm intended.
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    # renumber NFC first/last characters densely (0..total-1) and record
    # the contiguous code point ranges they occupy
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    # dense total_first x total_last matrix of composed characters
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
    print("/* a list of unique database records */", file=fp)
    print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    print("/* Reindexing of NFC first characters. */", file=fp)
    print("#define TOTAL_FIRST",total_first, file=fp)
    print("#define TOTAL_LAST",total_last, file=fp)
    print("struct reindex{int start;short count,index;};", file=fp)
    print("static struct reindex nfc_first[] = {", file=fp)
    for start,end in comp_first_ranges:
        print("  { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)
    print("  {0,0,0}", file=fp)
    print("};\n", file=fp)
    print("static struct reindex nfc_last[] = {", file=fp)
    for start,end in comp_last_ranges:
        print("  { %d, %d, %d}," % (start,end-start,comp_last[start]), file=fp)
    print("  {0,0,0}", file=fp)
    print("};\n", file=fp)

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print("/* string literals */", file=fp)
    print("const char *_PyUnicode_CategoryNames[] = {", file=fp)
    for name in CATEGORY_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_BidirectionalNames[] = {", file=fp)
    for name in BIDIRECTIONAL_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_EastAsianWidthNames[] = {", file=fp)
    for name in EASTASIANWIDTH_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("static const char *decomp_prefix[] = {", file=fp)
    for name in decomp_prefix:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    # split record index table into a two-level (index1/index2) lookup
    index1, index2, shift = splitbins(index, trace)

    print("/* index tables for the database records */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print("/* decomposition data */", file=fp)
    Array("decomp_data", decomp_data).dump(fp, trace)

    print("/* index tables for the decomposition data */", file=fp)
    print("#define DECOMP_SHIFT", shift, file=fp)
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print("/* NFC pairs */", file=fp)
    print("#define COMP_SHIFT", shift, file=fp)
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions
    # (note: this loop deliberately reuses the names table/cache/index
    # for the per-version data)
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print("static const change_record change_records_%s[] = {" % cversion, file=fp)
        for record in records:
            print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
        print("};", file=fp)
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tint index;", file=fp)
        print("\tif (n >= 0x110000) index = 0;", file=fp)
        print("\telse {", file=fp)
        print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
        print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1)), file=fp)
        print("\t}", file=fp)
        print("\treturn change_records_%s+index;" % cversion, file=fp)
        print("}\n", file=fp)
        print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tswitch(n) {", file=fp)
        for k, v in normalization:
            print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
        print("\tdefault: return 0;", file=fp)
        print("\t}\n}\n", file=fp)

    fp.close()
373
# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):
    """Generate Objects/unicodetype_db.h from the parsed UCD.

    Emits, in order:
      * _PyUnicode_TypeRecords -- deduplicated (upper, lower, title,
        decimal, digit, flags) type records, plus index1/index2 tables
      * _PyUnicode_ExtendedCase -- overflow array for multi-character
        case mappings and case folds
      * generated C functions _PyUnicode_ToNumeric,
        _PyUnicode_IsWhitespace and _PyUnicode_IsLinebreak

    unicode is a UnicodeData instance (see maketables()); trace is passed
    through to splitbins() for statistics output.
    """

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types; record 0 is a shared dummy for unassigned
    # code points
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}        # numeric value (string) -> [code points]
    spaces = []
    linebreaks = []
    extra_casing = []   # overflow storage for multi-char case mappings

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            # NOTE(review): `delta` is never read after this assignment.
            delta = True
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if "Lowercase" in properties:
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if "Uppercase" in properties:
                flags |= UPPER_MASK
            # printable: everything except other control/format/surrogate
            # (C*) and separator (Z*) categories -- but space itself is
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            if "Cased" in properties:
                flags |= CASED_MASK
            if "Case_Ignorable" in properties:
                flags |= CASE_IGNORABLE_MASK
            sc = unicode.special_casing.get(char)
            cf = unicode.case_folding.get(char, [char])
            # simple one-to-one case mappings from UnicodeData.txt
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                title = upper
            # a case fold differing from the lowercase forces the
            # extended-case representation below
            if sc is None and cf != [lower]:
                sc = ([lower], [title], [upper])
            if sc is None:
                # simple case: store signed deltas from the code point
                # (0, 0, 0 when the character does not case-map at all)
                if upper == lower == title:
                    upper = lower = title = 0
                else:
                    upper = upper - char
                    lower = lower - char
                    title = title - char
                    assert (abs(upper) <= 2147483647 and
                            abs(lower) <= 2147483647 and
                            abs(title) <= 2147483647)
            else:
                # This happens either when some character maps to more than one
                # character in uppercase, lowercase, or titlecase or the
                # casefolded version of the character is different from the
                # lowercase. The extra characters are stored in a different
                # array.
                # Each field packs: offset into extra_casing (low bits),
                # mapping length << 24, and for `lower` optionally the
                # fold length << 20.
                flags |= EXTENDED_CASE_MASK
                lower = len(extra_casing) | (len(sc[0]) << 24)
                extra_casing.extend(sc[0])
                if cf != sc[0]:
                    lower |= len(cf) << 20
                    extra_casing.extend(cf)
                upper = len(extra_casing) | (len(sc[2]) << 24)
                extra_casing.extend(sc[2])
                # Title is probably equal to upper.
                if sc[1] == sc[2]:
                    title = upper
                else:
                    title = len(extra_casing) | (len(sc[1]) << 24)
                    extra_casing.extend(sc[1])
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables (deduplicated via cache)
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")
    print(len(extra_casing), "extended case array")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("/* a list of unique character type descriptors */", file=fp)
    print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    print("/* extended case mappings */", file=fp)
    print(file=fp)
    print("const Py_UCS4 _PyUnicode_ExtendedCase[] = {", file=fp)
    for c in extra_casing:
        print("    %d," % c, file=fp)
    print("};", file=fp)
    print(file=fp)

    # split decomposition index table
    index1, index2, shift = splitbins(index, trace)

    print("/* type indexes */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # Generate code for _PyUnicode_ToNumeric()
    numeric_items = sorted(numeric.items())
    print('/* Returns the numeric value as double for Unicode characters', file=fp)
    print(' * having this property, -1.0 otherwise.', file=fp)
    print(' */', file=fp)
    print('double _PyUnicode_ToNumeric(Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for value, codepoints in numeric_items:
        # Turn text into float literals (fractions like "1/4" become
        # quotient expressions of two float literals)
        parts = value.split('/')
        parts = [repr(float(part)) for part in parts]
        value = '/'.join(parts)

        codepoints.sort()
        for codepoint in codepoints:
            print('    case 0x%04X:' % (codepoint,), file=fp)
        print('        return (double) %s;' % (value,), file=fp)
    print('    }', file=fp)
    print('    return -1.0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsWhitespace()
    print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)
    print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsWhitespace(register const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)

    for codepoint in sorted(spaces):
        print('    case 0x%04X:' % (codepoint,), file=fp)
    print('        return 1;', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsLinebreak()
    print("/* Returns 1 for Unicode characters having the line break", file=fp)
    print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp)
    print(" * type 'B', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsLinebreak(register const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for codepoint in sorted(linebreaks):
        print('    case 0x%04X:' % (codepoint,), file=fp)
    print('        return 1;', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    fp.close()
586
# --------------------------------------------------------------------
# unicode name database

def makeunicodename(unicode, trace):
    """Write Modules/unicodename_db.h, the character-name database.

    Three related structures are emitted as C arrays:
      * a compressed "lexicon" of every word occurring in character names,
      * a per-codepoint "phrasebook" of lexicon indexes (code -> name),
      * a static hash table mapping names back to codepoints (name -> code),
    plus the name-alias and named-sequence tables.

    unicode: UnicodeData instance (uses .chars, .table, .aliases and
             .named_sequences).
    trace:   verbosity flag forwarded to Array.dump and splitbins.
    """

    FILE = "Modules/unicodename_db.h"

    print("--- Preparing", FILE, "...")

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            # skip pseudo-names in angle brackets (<control>,
            # <...First>, <...Last>): they are markers, not real names
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print(len(list(n for n in names if n is not None)), "distinct names")

    # collect unique words from names (note that we differentiate
    # between words inside a sentence and words ending a sentence;
    # the latter include the trailing null byte)

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    # seen before: just record one more occurrence
                    l.append(None)
                else:
                    # first occurrence
                    words[w] = [len(words)]

    print(n, "words in text;", b, "bytes")

    wordlist = list(words.items())

    # sort on falling frequency, then by name
    def word_key(a):
        aword, alist = a
        return -len(alist), aword
    wordlist.sort(key=word_key)

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print(escapes, "escapes")

    short = 256 - escapes

    assert short > 0

    print(short, "short indexes in lexicon")

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print(n, "short indexes in phrasebook")

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = list(map(ord, lexicon))

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    # frequent word: fits in a single byte
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    # the phrasebook must fit in unsigned chars
    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set.  if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("#define NAME_MAXLEN", 256, file=fp)
    print(file=fp)
    print("/* lexicon */", file=fp)
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split the phrasebook offset table into a two-level index
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print("/* code->name phrasebook */", file=fp)
    print("#define phrasebook_shift", shift, file=fp)
    print("#define phrasebook_short", short, file=fp)

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print("/* name->code dictionary */", file=fp)
    codehash.dump(fp, trace)

    print(file=fp)
    print('static const unsigned int aliases_start = %#x;' %
          NAME_ALIASES_START, file=fp)
    print('static const unsigned int aliases_end = %#x;' %
          (NAME_ALIASES_START + len(unicode.aliases)), file=fp)

    print('static const unsigned int name_aliases[] = {', file=fp)
    for name, codepoint in unicode.aliases:
        print('    0x%04X,' % codepoint, file=fp)
    print('};', file=fp)

    # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
    # so we are using Py_UCS2 seq[4].  This needs to be updated if longer
    # sequences or sequences with non-BMP chars are added.
    # unicodedata_lookup should be adapted too.
    print(dedent("""
        typedef struct NamedSequence {
            int seqlen;
            Py_UCS2 seq[4];
        } named_sequence;
        """), file=fp)

    print('static const unsigned int named_sequences_start = %#x;' %
          NAMED_SEQUENCES_START, file=fp)
    print('static const unsigned int named_sequences_end = %#x;' %
          (NAMED_SEQUENCES_START + len(unicode.named_sequences)), file=fp)

    print('static const named_sequence named_sequences[] = {', file=fp)
    for name, sequence in unicode.named_sequences:
        seq_str = ', '.join('0x%04X' % cp for cp in sequence)
        print('    {%d, {%s}},' % (len(sequence), seq_str), file=fp)
    print('};', file=fp)

    fp.close()
781
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000782
783def merge_old_version(version, new, old):
784 # Changes to exclusion file not implemented yet
785 if old.exclusions != new.exclusions:
Collin Wintera817e582007-08-22 23:05:06 +0000786 raise NotImplementedError("exclusions differ")
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000787
788 # In these change records, 0xFF means "no change"
789 bidir_changes = [0xFF]*0x110000
790 category_changes = [0xFF]*0x110000
791 decimal_changes = [0xFF]*0x110000
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000792 mirrored_changes = [0xFF]*0x110000
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000793 # In numeric data, 0 means "no change",
794 # -1 means "did not have a numeric value
795 numeric_changes = [0] * 0x110000
796 # normalization_changes is a list of key-value pairs
797 normalization_changes = []
798 for i in range(0x110000):
799 if new.table[i] is None:
800 # Characters unassigned in the new version ought to
801 # be unassigned in the old one
802 assert old.table[i] is None
803 continue
804 # check characters unassigned in the old version
805 if old.table[i] is None:
806 # category 0 is "unassigned"
807 category_changes[i] = 0
808 continue
809 # check characters that differ
810 if old.table[i] != new.table[i]:
811 for k in range(len(old.table[i])):
812 if old.table[i][k] != new.table[i][k]:
813 value = old.table[i][k]
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300814 if k == 1 and i in PUA_15:
815 # the name is not set in the old.table, but in the
816 # new.table we are using it for aliases and named seq
817 assert value == ''
818 elif k == 2:
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000819 #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
820 category_changes[i] = CATEGORY_NAMES.index(value)
821 elif k == 4:
822 #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
823 bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
824 elif k == 5:
825 #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
826 # We assume that all normalization changes are in 1:1 mappings
827 assert " " not in value
828 normalization_changes.append((i, value))
829 elif k == 6:
830 #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
831 # we only support changes where the old value is a single digit
832 assert value in "0123456789"
833 decimal_changes[i] = int(value)
834 elif k == 8:
835 # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
836 # Since 0 encodes "no change", the old value is better not 0
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000837 if not value:
838 numeric_changes[i] = -1
839 else:
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +0000840 numeric_changes[i] = float(value)
841 assert numeric_changes[i] not in (0, -1)
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000842 elif k == 9:
843 if value == 'Y':
844 mirrored_changes[i] = '1'
845 else:
846 mirrored_changes[i] = '0'
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000847 elif k == 11:
848 # change to ISO comment, ignore
849 pass
850 elif k == 12:
851 # change to simple uppercase mapping; ignore
852 pass
853 elif k == 13:
854 # change to simple lowercase mapping; ignore
855 pass
856 elif k == 14:
857 # change to simple titlecase mapping; ignore
858 pass
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000859 elif k == 16:
860 # derived property changes; not yet
861 pass
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000862 elif k == 17:
863 # normalization quickchecks are not performed
864 # for older versions
865 pass
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000866 else:
867 class Difference(Exception):pass
Collin Wintera817e582007-08-22 23:05:06 +0000868 raise Difference(hex(i), k, old.table[i], new.table[i])
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000869 new.changed.append((version, list(zip(bidir_changes, category_changes,
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000870 decimal_changes, mirrored_changes,
871 numeric_changes)),
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000872 normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000873
def open_data(template, version):
    """Open (downloading and caching first, if necessary) a UCD data file.

    template is a filename pattern with a single '%s' slot; version is a
    Unicode version string.  The file is cached in the working directory
    (e.g. 'UnicodeData-3.2.0.txt').  Text files are returned opened in
    UTF-8 text mode, anything else (the Unihan zip archive) in binary.
    """
    cached = template % ('-' + version,)
    if not os.path.exists(cached):
        import urllib.request
        if version == '3.2.0':
            # irregular url structure
            url = 'http://www.unicode.org/Public/3.2-Update/' + cached
        else:
            url = ('http://www.unicode.org/Public/%s/ucd/' + template) % (version, '')
        urllib.request.urlretrieve(url, filename=cached)
    if cached.endswith('.txt'):
        return open(cached, encoding='utf-8')
    # Unihan.zip
    return open(cached, 'rb')
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000889
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000890# --------------------------------------------------------------------
891# the following support code is taken from the unidb utilities
892# Copyright (c) 1999-2000 by Secret Labs AB
893
894# load a unicode-data file from disk
895
class UnicodeData:
    """In-memory representation of the Unicode Character Database.

    Loads UnicodeData.txt plus the auxiliary UCD files (via open_data)
    for the given version and builds self.table, a list indexed by
    codepoint (0..0x10FFFF).  Each entry is the list of semicolon-split
    record fields, extended with the extra columns listed below; None
    marks an unassigned codepoint.
    """
    # Record structure:
    # [ID, name, category, combining, bidi, decomp,  (6)
    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
    #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
    #  derived-props] (17)

    def __init__(self, version,
                 linebreakprops=False,
                 expand=1,
                 cjk_check=True):
        # NOTE(review): linebreakprops is not referenced in this method --
        # presumably kept for caller compatibility; confirm before removing.
        # delta records are appended later by merge_old_version()
        self.changed = []
        table = [None] * 0x110000
        with open_data(UNICODE_DATA, version) as file:
            while 1:
                s = file.readline()
                if not s:
                    break
                s = s.strip().split(";")
                char = int(s[0], 16)
                table[char] = s

        cjk_ranges_found = []

        # expand first-last ranges
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        if s[1].startswith("<CJK Ideograph"):
                            cjk_ranges_found.append((field[0],
                                                     s[0]))
                        s[1] = ""
                        field = None
                elif field:
                    # codepoint inside a First>..Last> range: clone the
                    # range record with the actual codepoint filled in
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2
        if cjk_check and cjk_ranges != cjk_ranges_found:
            raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)

        # public attributes
        self.filename = UNICODE_DATA % ''
        self.table = table
        self.chars = list(range(0x110000)) # unicode 3.2

        # check for name aliases and named sequences, see #12753
        # aliases and named sequences are not in 3.2.0
        if version != '3.2.0':
            self.aliases = []
            # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters
            pua_index = NAME_ALIASES_START
            with open_data(NAME_ALIASES, version) as file:
                for s in file:
                    s = s.strip()
                    if not s or s.startswith('#'):
                        continue
                    char, name, abbrev = s.split(';')
                    char = int(char, 16)
                    self.aliases.append((name, char))
                    # also store the name in the PUA 1
                    self.table[pua_index][1] = name
                    pua_index += 1
            assert pua_index - NAME_ALIASES_START == len(self.aliases)

            self.named_sequences = []
            # store named sequences in the PUA 1, in range U+F0100..,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters.

            # aliases and sequences must not collide in the PUA
            assert pua_index < NAMED_SEQUENCES_START
            pua_index = NAMED_SEQUENCES_START
            with open_data(NAMED_SEQUENCES, version) as file:
                for s in file:
                    s = s.strip()
                    if not s or s.startswith('#'):
                        continue
                    name, chars = s.split(';')
                    chars = tuple(int(char, 16) for char in chars.split())
                    # check that the structure defined in makeunicodename is OK
                    assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
                    assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
                        "the NamedSequence struct and in unicodedata_lookup")
                    self.named_sequences.append((name, chars))
                    # also store these in the PUA 1
                    self.table[pua_index][1] = name
                    pua_index += 1
            assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)

        # composition exclusions: codepoint -> 1
        self.exclusions = {}
        with open_data(COMPOSITION_EXCLUSIONS, version) as file:
            for s in file:
                s = s.strip()
                if not s:
                    continue
                if s[0] == '#':
                    continue
                char = int(s.split()[0],16)
                self.exclusions[char] = 1

        # east asian width: appended as an extra record column below
        widths = [None] * 0x110000
        with open_data(EASTASIAN_WIDTH, version) as file:
            for s in file:
                s = s.strip()
                if not s:
                    continue
                if s[0] == '#':
                    continue
                s = s.split()[0].split(';')
                if '..' in s[0]:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                    chars = list(range(first, last+1))
                else:
                    chars = [int(s[0], 16)]
                for char in chars:
                    widths[char] = s[1]

        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

        # extra column: set of derived property names for each codepoint
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(set())

        with open_data(DERIVED_CORE_PROPERTIES, version) as file:
            for s in file:
                s = s.split('#', 1)[0].strip()
                if not s:
                    continue

                r, p = s.split(";")
                r = r.strip()
                p = p.strip()
                if ".." in r:
                    first, last = [int(c, 16) for c in r.split('..')]
                    chars = list(range(first, last+1))
                else:
                    chars = [int(r, 16)]
                for char in chars:
                    if table[char]:
                        # Some properties (e.g. Default_Ignorable_Code_Point)
                        # apply to unassigned code points; ignore them
                        table[char][-1].add(p)

        # mandatory line breaks: recorded as a pseudo derived property
        with open_data(LINE_BREAK, version) as file:
            for s in file:
                s = s.partition('#')[0]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
                    continue
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    table[char][-1].add('Line_Break')

        # We only want the quickcheck properties
        # Format: NF?_QC; Y(es)/N(o)/M(aybe)
        # Yes is the default, hence only N and M occur
        # In 3.2.0, the format was different (NF?_NO)
        # The parsing will incorrectly determine these as
        # "yes", however, unicodedata.c will not perform quickchecks
        # for older versions, and no delta records will be created.
        quickchecks = [0] * 0x110000
        qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
        with open_data(DERIVEDNORMALIZATION_PROPS, version) as file:
            for s in file:
                if '#' in s:
                    s = s[:s.index('#')]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in qc_order:
                    continue
                quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
                quickcheck_shift = qc_order.index(s[1])*2
                quickcheck <<= quickcheck_shift
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    # each property's 2-bit slot must be set at most once
                    assert not (quickchecks[char]>>quickcheck_shift)&3
                    quickchecks[char] |= quickcheck
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(quickchecks[i])

        # Unihan numeric values patch the record's numeric field (index 8)
        with open_data(UNIHAN, version) as file:
            zip = zipfile.ZipFile(file)    # NOTE: shadows the zip() builtin
            if version == '3.2.0':
                data = zip.open('Unihan-3.2.0.txt').read()
            else:
                data = zip.open('Unihan_NumericValues.txt').read()
        for line in data.decode("utf-8").splitlines():
            if not line.startswith('U+'):
                continue
            code, tag, value = line.split(None, 3)[:3]
            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                           'kOtherNumeric'):
                continue
            value = value.strip().replace(',', '')
            i = int(code[2:], 16)
            # Patch the numeric field
            if table[i] is not None:
                table[i][8] = value

        # full case mappings: codepoint -> ([lower], [title], [upper])
        sc = self.special_casing = {}
        with open_data(SPECIAL_CASING, version) as file:
            for s in file:
                s = s[:-1].split('#', 1)[0]
                if not s:
                    continue
                data = s.split("; ")
                if data[4]:
                    # We ignore all conditionals (since they depend on
                    # languages) except for one, which is hardcoded. See
                    # handle_capital_sigma in unicodeobject.c.
                    continue
                c = int(data[0], 16)
                lower = [int(char, 16) for char in data[1].split()]
                title = [int(char, 16) for char in data[2].split()]
                upper = [int(char, 16) for char in data[3].split()]
                sc[c] = (lower, title, upper)

        # case folding (common + full statuses): codepoint -> [codepoints]
        cf = self.case_folding = {}
        if version != '3.2.0':
            with open_data(CASE_FOLDING, version) as file:
                for s in file:
                    s = s[:-1].split('#', 1)[0]
                    if not s:
                        continue
                    data = s.split("; ")
                    if data[1] in "CF":
                        c = int(data[0], 16)
                        cf[c] = [int(char, 16) for char in data[2].split()]

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = list(range(256))
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001141
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001142# hash table tools
1143
1144# this is a straight-forward reimplementation of Python's built-in
1145# dictionary type, using a static data structure, and a custom string
1146# hash algorithm.
1147
def myhash(s, magic):
    """Case-insensitive multiplicative string hash.

    Folds each character of the upper-cased string into a 24-bit
    accumulator; whenever the value overflows 24 bits, the high byte is
    xor-ed back in and masked off.  The C lookup code reproduces this
    exact sequence, so the algorithm must not change.
    """
    acc = 0
    for ch in s.upper():
        acc = acc * magic + ord(ch)
        overflow = acc & 0xff000000
        if overflow:
            acc = (acc ^ ((overflow >> 24) & 0xff)) & 0x00ffffff
    return acc
1156
# Candidate hash-table configurations as (size, poly) pairs: size is a
# power-of-two table size, and poly is added to size by Hash.__init__ to
# form the polynomial used for collision-probe stepping.
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
1163
class Hash:
    """Static open-addressing hash table, dumped as C arrays.

    Built from a (key, value) list using myhash() with multiplier
    `magic`; collisions are resolved by a probe increment that is
    doubled each step and reduced by a size-specific polynomial (see
    SIZES).  The parameters (magic, size, poly) are emitted as #defines
    so the C lookup code can reproduce the same probe sequence.
    """
    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size: smallest configured size strictly larger
        # than the number of entries
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynomials")

        print(size, "slots in hash table")

        table = [None] * size

        mask = size-1

        n = 0  # collision counter

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            # slot taken: derive a probe increment from the hash
            incr = (h ^ (h >> 3)) & mask;
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                # step the increment through the polynomial
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print(n, "collisions")
        self.collisions = n

        # empty slots become 0 so the table packs into an integer array
        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array, followed by the #defines the
        # C lookup code needs to replay the probing
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
1227
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001228# stuff to deal with arrays of unsigned integers
1229
class Array:
    """A named sequence of unsigned integers, writable as a C array."""

    def __init__(self, name, data):
        self.name = name    # C identifier used in the generated source
        self.data = data    # sequence of non-negative ints

    def dump(self, file, trace=0):
        """Write the data to *file* as a static C array definition.

        The element type (unsigned char/short/int) is the smallest that
        fits every value; output lines wrap at 78 columns.  When trace
        is non-zero, the array's byte size is printed to stderr.
        """
        width = getsize(self.data)
        if trace:
            print(self.name + ":", width * len(self.data), "bytes",
                  file=sys.stderr)
        ctype = {1: "unsigned char", 2: "unsigned short"}.get(
            width, "unsigned int")
        file.write("static " + ctype)
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            line = "    "
            for value in self.data:
                piece = str(value) + ", "
                if len(line) + len(piece) > 78:
                    # line full: flush it and start the next one
                    file.write(line + "\n")
                    line = "    " + piece
                else:
                    line = line + piece
            if line.strip():
                file.write(line + "\n")
        file.write("};\n\n")
1261
def getsize(data):
    """Return the smallest C integer width in bytes (1, 2 or 4) that can
    represent every value of *data*, a non-empty sequence of ints."""
    largest = max(data)
    if largest < 256:
        return 1
    if largest < 65536:
        return 2
    return 4
1271
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """
    if trace:
        def dump(t1, t2, shift, bytes):
            print("%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes), file=sys.stderr)
        print("Size of original table:", len(t)*getsize(t),
              "bytes", file=sys.stderr)

    # Largest shift that still leaves a non-empty index table: one less
    # than the bit length of the last valid index.
    last = len(t) - 1
    maxshift = last.bit_length() - 1 if last > 0 else 0

    best_cost = sys.maxsize          # smallest total size seen so far
    t = tuple(t)                     # so slices can be dict keys
    for shift in range(maxshift + 1):
        binsize = 2**shift
        index = []                   # candidate t1
        values = []                  # candidate t2
        cache = {}                   # bin contents -> offset in values
        for start in range(0, len(t), binsize):
            chunk = t[start:start + binsize]
            pos = cache.get(chunk)
            if pos is None:
                # first time we see this bin: append it to the value table
                pos = len(values)
                cache[chunk] = pos
                values.extend(chunk)
            index.append(pos >> shift)
        # memory footprint of this decomposition, in bytes of C data
        cost = len(index)*getsize(index) + len(values)*getsize(values)
        if trace > 1:
            dump(index, values, shift, cost)
        if cost < best_cost:
            best = index, values, shift
            best_cost = cost
    t1, t2, shift = best
    if trace:
        print("Best:", end=' ', file=sys.stderr)
        dump(t1, t2, shift, best_cost)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift)      # i.e., low-bit mask of shift bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001332
if __name__ == "__main__":
    # Regenerate all database headers.  The argument is presumably the
    # trace/verbosity flag forwarded by maketables() (defined earlier in
    # this file, outside this view) -- confirm against its signature.
    maketables(1)