blob: 5b9427acd3900342413cb9b245157221b9a1de4e [file] [log] [blame]
#
# (re)generate unicode property and type databases
#
# This script converts Unicode database files to Modules/unicodedata_db.h,
# Modules/unicodename_db.h, and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl    created (based on bits and pieces from unidb)
# 2000-09-25 fl    merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl    added character type table
# 2000-09-26 fl    added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl    expand first/last ranges
# 2001-01-19 fl    added character name tables (2.1)
# 2001-01-21 fl    added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd    use string methods
# 2002-10-18 mvl   update to Unicode 3.2
# 2002-10-22 mvl   generate NFC tables
# 2002-11-24 mvl   expand all ranges, sort names version-independently
# 2002-11-25 mvl   add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl   update to Unicode 4.1; add UCD 3.2 delta
# 2008-06-11 gb    add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
# 2011-10-21 ezio  add support for name aliases and named sequences
# 2012-01 benjamin add full case mappings
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#
28
import os
import sys
import zipfile

from textwrap import dedent
from functools import partial

# Name of this script and generator version, embedded in the generated headers.
SCRIPT = sys.argv[0]
VERSION = "3.3"

# The Unicode Database
# --------------------
# When changing UCD version please update
#   * Doc/library/stdtypes.rst, and
#   * Doc/library/unicodedata.rst
#   * Doc/reference/lexical_analysis.rst (two occurrences)
UNIDATA_VERSION = "12.1.0"

# Filename templates for the UCD input files; "%s" receives "" for the
# current version or "-<version>" for an old UCD snapshot.
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt"
CASE_FOLDING = "CaseFolding%s.txt"

# Private Use Areas -- in planes 1, 15, 16
PUA_1 = range(0xE000, 0xF900)
PUA_15 = range(0xF0000, 0xFFFFE)
PUA_16 = range(0x100000, 0x10FFFE)

# we use these ranges of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0200

# Older UCD versions for which delta tables are generated.
old_versions = ["3.2.0"]

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON", "LRI", "RLI", "FSI", "PDI" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NUMERIC_MASK = 0x800
CASE_IGNORABLE_MASK = 0x1000
CASED_MASK = 0x2000
EXTENDED_CASE_MASK = 0x4000

# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
    ('3400', '4DB5'),
    ('4E00', '9FEF'),
    ('20000', '2A6D6'),
    ('2A700', '2B734'),
    ('2B740', '2B81D'),
    ('2B820', '2CEA1'),
    ('2CEB0', '2EBE0'),
]
109
Stefan Behnelfaa29482019-06-01 21:49:03 +0200110
def maketables(trace=0):
    """Regenerate all three Unicode database headers.

    Reads the current UCD (UNIDATA_VERSION) plus every snapshot listed in
    ``old_versions`` (used to build version-delta tables), then writes
    Modules/unicodename_db.h, Modules/unicodedata_db.h and
    Objects/unicodetype_db.h.  *trace* is passed through to the table
    generators for splitbins diagnostics.
    """

    print("--- Reading", UNICODE_DATA % "", "...")

    unicode = UnicodeData(UNIDATA_VERSION)

    print(len(list(filter(None, unicode.table))), "characters")

    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-"+version), "...")
        # cjk_check=False: old snapshots predate the current CJK ranges.
        old_unicode = UnicodeData(version, cjk_check=False)
        print(len(list(filter(None, old_unicode.table))), "characters")
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000128
Stefan Behnelfaa29482019-06-01 21:49:03 +0200129
# --------------------------------------------------------------------
# unicode character properties

133def makeunicodedata(unicode, trace):
134
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000135 dummy = (0, 0, 0, 0, 0, 0)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000136 table = [dummy]
137 cache = {0: dummy}
138 index = [0] * len(unicode.chars)
139
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000140 FILE = "Modules/unicodedata_db.h"
141
Collin Winter6afaeb72007-08-03 17:06:41 +0000142 print("--- Preparing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000143
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000144 # 1) database properties
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000145
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000146 for char in unicode.chars:
147 record = unicode.table[char]
148 if record:
149 # extract database properties
150 category = CATEGORY_NAMES.index(record[2])
151 combining = int(record[3])
152 bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
153 mirrored = record[9] == "Y"
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000154 eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000155 normalizationquickcheck = record[17]
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000156 item = (
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000157 category, combining, bidirectional, mirrored, eastasianwidth,
158 normalizationquickcheck
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000159 )
160 # add entry to index and item tables
161 i = cache.get(item)
162 if i is None:
163 cache[item] = i = len(table)
164 table.append(item)
165 index[char] = i
166
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000167 # 2) decomposition data
168
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000169 decomp_data = [0]
170 decomp_prefix = [""]
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000171 decomp_index = [0] * len(unicode.chars)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000172 decomp_size = 0
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000173
Martin v. Löwis677bde22002-11-23 22:08:15 +0000174 comp_pairs = []
175 comp_first = [None] * len(unicode.chars)
176 comp_last = [None] * len(unicode.chars)
177
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000178 for char in unicode.chars:
179 record = unicode.table[char]
180 if record:
181 if record[5]:
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000182 decomp = record[5].split()
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000183 if len(decomp) > 19:
Collin Wintera817e582007-08-22 23:05:06 +0000184 raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000185 # prefix
186 if decomp[0][0] == "<":
187 prefix = decomp.pop(0)
188 else:
189 prefix = ""
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000190 try:
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000191 i = decomp_prefix.index(prefix)
192 except ValueError:
193 i = len(decomp_prefix)
194 decomp_prefix.append(prefix)
195 prefix = i
196 assert prefix < 256
197 # content
Georg Brandlbf82e372008-05-16 17:02:34 +0000198 decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
Martin v. Löwis677bde22002-11-23 22:08:15 +0000199 # Collect NFC pairs
200 if not prefix and len(decomp) == 3 and \
201 char not in unicode.exclusions and \
202 unicode.table[decomp[1]][3] == "0":
203 p, l, r = decomp
204 comp_first[l] = 1
205 comp_last[r] = 1
206 comp_pairs.append((l,r,char))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000207 try:
208 i = decomp_data.index(decomp)
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000209 except ValueError:
210 i = len(decomp_data)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000211 decomp_data.extend(decomp)
212 decomp_size = decomp_size + len(decomp) * 2
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000213 else:
214 i = 0
215 decomp_index[char] = i
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000216
Martin v. Löwis677bde22002-11-23 22:08:15 +0000217 f = l = 0
218 comp_first_ranges = []
219 comp_last_ranges = []
220 prev_f = prev_l = None
221 for i in unicode.chars:
222 if comp_first[i] is not None:
223 comp_first[i] = f
224 f += 1
225 if prev_f is None:
226 prev_f = (i,i)
227 elif prev_f[1]+1 == i:
228 prev_f = prev_f[0],i
229 else:
230 comp_first_ranges.append(prev_f)
231 prev_f = (i,i)
232 if comp_last[i] is not None:
233 comp_last[i] = l
234 l += 1
235 if prev_l is None:
236 prev_l = (i,i)
237 elif prev_l[1]+1 == i:
238 prev_l = prev_l[0],i
239 else:
240 comp_last_ranges.append(prev_l)
241 prev_l = (i,i)
242 comp_first_ranges.append(prev_f)
243 comp_last_ranges.append(prev_l)
244 total_first = f
245 total_last = l
246
247 comp_data = [0]*(total_first*total_last)
248 for f,l,char in comp_pairs:
249 f = comp_first[f]
250 l = comp_last[l]
251 comp_data[f*total_last+l] = char
252
Collin Winter6afaeb72007-08-03 17:06:41 +0000253 print(len(table), "unique properties")
254 print(len(decomp_prefix), "unique decomposition prefixes")
255 print(len(decomp_data), "unique decomposition entries:", end=' ')
256 print(decomp_size, "bytes")
257 print(total_first, "first characters in NFC")
258 print(total_last, "last characters in NFC")
259 print(len(comp_pairs), "NFC pairs")
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000260
Collin Winter6afaeb72007-08-03 17:06:41 +0000261 print("--- Writing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000262
Stefan Behnelfaa29482019-06-01 21:49:03 +0200263 with open(FILE, "w") as fp:
264 fprint = partial(print, file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000265
Stefan Behnelfaa29482019-06-01 21:49:03 +0200266 fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
267 fprint()
268 fprint('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION)
269 fprint("/* a list of unique database records */")
270 fprint("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {")
271 for item in table:
272 fprint(" {%d, %d, %d, %d, %d, %d}," % item)
273 fprint("};")
274 fprint()
Martin v. Löwis677bde22002-11-23 22:08:15 +0000275
Stefan Behnelfaa29482019-06-01 21:49:03 +0200276 fprint("/* Reindexing of NFC first characters. */")
277 fprint("#define TOTAL_FIRST",total_first)
278 fprint("#define TOTAL_LAST",total_last)
279 fprint("struct reindex{int start;short count,index;};")
280 fprint("static struct reindex nfc_first[] = {")
281 for start,end in comp_first_ranges:
282 fprint(" { %d, %d, %d}," % (start,end-start,comp_first[start]))
283 fprint(" {0,0,0}")
284 fprint("};\n")
285 fprint("static struct reindex nfc_last[] = {")
286 for start,end in comp_last_ranges:
287 fprint(" { %d, %d, %d}," % (start,end-start,comp_last[start]))
288 fprint(" {0,0,0}")
289 fprint("};\n")
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000290
Stefan Behnelfaa29482019-06-01 21:49:03 +0200291 # FIXME: <fl> the following tables could be made static, and
292 # the support code moved into unicodedatabase.c
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000293
Stefan Behnelfaa29482019-06-01 21:49:03 +0200294 fprint("/* string literals */")
295 fprint("const char *_PyUnicode_CategoryNames[] = {")
296 for name in CATEGORY_NAMES:
297 fprint(" \"%s\"," % name)
298 fprint(" NULL")
299 fprint("};")
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000300
Stefan Behnelfaa29482019-06-01 21:49:03 +0200301 fprint("const char *_PyUnicode_BidirectionalNames[] = {")
302 for name in BIDIRECTIONAL_NAMES:
303 fprint(" \"%s\"," % name)
304 fprint(" NULL")
305 fprint("};")
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000306
Stefan Behnelfaa29482019-06-01 21:49:03 +0200307 fprint("const char *_PyUnicode_EastAsianWidthNames[] = {")
308 for name in EASTASIANWIDTH_NAMES:
309 fprint(" \"%s\"," % name)
310 fprint(" NULL")
311 fprint("};")
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000312
Stefan Behnelfaa29482019-06-01 21:49:03 +0200313 fprint("static const char *decomp_prefix[] = {")
314 for name in decomp_prefix:
315 fprint(" \"%s\"," % name)
316 fprint(" NULL")
317 fprint("};")
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000318
Stefan Behnelfaa29482019-06-01 21:49:03 +0200319 # split record index table
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000320 index1, index2, shift = splitbins(index, trace)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000321
Stefan Behnelfaa29482019-06-01 21:49:03 +0200322 fprint("/* index tables for the database records */")
323 fprint("#define SHIFT", shift)
324 Array("index1", index1).dump(fp, trace)
325 Array("index2", index2).dump(fp, trace)
326
327 # split decomposition index table
328 index1, index2, shift = splitbins(decomp_index, trace)
329
330 fprint("/* decomposition data */")
331 Array("decomp_data", decomp_data).dump(fp, trace)
332
333 fprint("/* index tables for the decomposition data */")
334 fprint("#define DECOMP_SHIFT", shift)
335 Array("decomp_index1", index1).dump(fp, trace)
336 Array("decomp_index2", index2).dump(fp, trace)
337
338 index, index2, shift = splitbins(comp_data, trace)
339 fprint("/* NFC pairs */")
340 fprint("#define COMP_SHIFT", shift)
341 Array("comp_index", index).dump(fp, trace)
342 Array("comp_data", index2).dump(fp, trace)
343
344 # Generate delta tables for old versions
345 for version, table, normalization in unicode.changed:
346 cversion = version.replace(".","_")
347 records = [table[0]]
348 cache = {table[0]:0}
349 index = [0] * len(table)
350 for i, record in enumerate(table):
351 try:
352 index[i] = cache[record]
353 except KeyError:
354 index[i] = cache[record] = len(records)
355 records.append(record)
356 index1, index2, shift = splitbins(index, trace)
357 fprint("static const change_record change_records_%s[] = {" % cversion)
358 for record in records:
359 fprint(" { %s }," % ", ".join(map(str,record)))
360 fprint("};")
361 Array("changes_%s_index" % cversion, index1).dump(fp, trace)
362 Array("changes_%s_data" % cversion, index2).dump(fp, trace)
363 fprint("static const change_record* get_change_%s(Py_UCS4 n)" % cversion)
364 fprint("{")
365 fprint(" int index;")
366 fprint(" if (n >= 0x110000) index = 0;")
367 fprint(" else {")
368 fprint(" index = changes_%s_index[n>>%d];" % (cversion, shift))
369 fprint(" index = changes_%s_data[(index<<%d)+(n & %d)];" % \
370 (cversion, shift, ((1<<shift)-1)))
371 fprint(" }")
372 fprint(" return change_records_%s+index;" % cversion)
373 fprint("}\n")
374 fprint("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion)
375 fprint("{")
376 fprint(" switch(n) {")
377 for k, v in normalization:
378 fprint(" case %s: return 0x%s;" % (hex(k), v))
379 fprint(" default: return 0;")
380 fprint(" }\n}\n")
381
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000382
# --------------------------------------------------------------------
# unicode character type tables

386def makeunicodetype(unicode, trace):
387
388 FILE = "Objects/unicodetype_db.h"
389
Collin Winter6afaeb72007-08-03 17:06:41 +0000390 print("--- Preparing", FILE, "...")
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000391
392 # extract unicode types
Fredrik Lundh0f8fad42000-09-25 21:01:56 +0000393 dummy = (0, 0, 0, 0, 0, 0)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000394 table = [dummy]
395 cache = {0: dummy}
396 index = [0] * len(unicode.chars)
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +0000397 numeric = {}
398 spaces = []
399 linebreaks = []
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500400 extra_casing = []
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000401
402 for char in unicode.chars:
403 record = unicode.table[char]
404 if record:
405 # extract database properties
406 category = record[2]
407 bidirectional = record[4]
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000408 properties = record[16]
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000409 flags = 0
410 if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
411 flags |= ALPHA_MASK
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500412 if "Lowercase" in properties:
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000413 flags |= LOWER_MASK
Florent Xicluna806d8cf2010-03-30 19:34:18 +0000414 if 'Line_Break' in properties or bidirectional == "B":
Fredrik Lundh0f8fad42000-09-25 21:01:56 +0000415 flags |= LINEBREAK_MASK
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +0000416 linebreaks.append(char)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000417 if category == "Zs" or bidirectional in ("WS", "B", "S"):
418 flags |= SPACE_MASK
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +0000419 spaces.append(char)
Fredrik Lundh375732c2000-09-25 23:03:34 +0000420 if category == "Lt":
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000421 flags |= TITLE_MASK
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500422 if "Uppercase" in properties:
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000423 flags |= UPPER_MASK
Benjamin Peterson09832742009-03-26 17:15:46 +0000424 if char == ord(" ") or category[0] not in ("C", "Z"):
Georg Brandld52429f2008-07-04 15:55:02 +0000425 flags |= PRINTABLE_MASK
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000426 if "XID_Start" in properties:
427 flags |= XID_START_MASK
428 if "XID_Continue" in properties:
429 flags |= XID_CONTINUE_MASK
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500430 if "Cased" in properties:
431 flags |= CASED_MASK
432 if "Case_Ignorable" in properties:
433 flags |= CASE_IGNORABLE_MASK
434 sc = unicode.special_casing.get(char)
Benjamin Petersond5890c82012-01-14 13:23:30 -0500435 cf = unicode.case_folding.get(char, [char])
436 if record[12]:
437 upper = int(record[12], 16)
438 else:
439 upper = char
440 if record[13]:
441 lower = int(record[13], 16)
442 else:
443 lower = char
444 if record[14]:
445 title = int(record[14], 16)
446 else:
447 title = upper
448 if sc is None and cf != [lower]:
449 sc = ([lower], [title], [upper])
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500450 if sc is None:
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500451 if upper == lower == title:
452 upper = lower = title = 0
Benjamin Petersonad9c5692012-01-15 21:19:20 -0500453 else:
454 upper = upper - char
455 lower = lower - char
456 title = title - char
457 assert (abs(upper) <= 2147483647 and
458 abs(lower) <= 2147483647 and
459 abs(title) <= 2147483647)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000460 else:
Benjamin Petersond5890c82012-01-14 13:23:30 -0500461 # This happens either when some character maps to more than one
462 # character in uppercase, lowercase, or titlecase or the
463 # casefolded version of the character is different from the
464 # lowercase. The extra characters are stored in a different
465 # array.
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500466 flags |= EXTENDED_CASE_MASK
467 lower = len(extra_casing) | (len(sc[0]) << 24)
468 extra_casing.extend(sc[0])
Benjamin Petersond5890c82012-01-14 13:23:30 -0500469 if cf != sc[0]:
470 lower |= len(cf) << 20
471 extra_casing.extend(cf)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500472 upper = len(extra_casing) | (len(sc[2]) << 24)
473 extra_casing.extend(sc[2])
474 # Title is probably equal to upper.
475 if sc[1] == sc[2]:
476 title = upper
477 else:
478 title = len(extra_casing) | (len(sc[1]) << 24)
479 extra_casing.extend(sc[1])
Fredrik Lundh0f8fad42000-09-25 21:01:56 +0000480 # decimal digit, integer digit
481 decimal = 0
482 if record[6]:
483 flags |= DECIMAL_MASK
484 decimal = int(record[6])
485 digit = 0
486 if record[7]:
487 flags |= DIGIT_MASK
488 digit = int(record[7])
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +0000489 if record[8]:
490 flags |= NUMERIC_MASK
491 numeric.setdefault(record[8], []).append(char)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000492 item = (
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +0000493 upper, lower, title, decimal, digit, flags
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000494 )
495 # add entry to index and item tables
496 i = cache.get(item)
497 if i is None:
498 cache[item] = i = len(table)
499 table.append(item)
500 index[char] = i
501
Collin Winter6afaeb72007-08-03 17:06:41 +0000502 print(len(table), "unique character type entries")
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +0000503 print(sum(map(len, numeric.values())), "numeric code points")
504 print(len(spaces), "whitespace code points")
505 print(len(linebreaks), "linebreak code points")
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500506 print(len(extra_casing), "extended case array")
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000507
Collin Winter6afaeb72007-08-03 17:06:41 +0000508 print("--- Writing", FILE, "...")
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000509
Stefan Behnelfaa29482019-06-01 21:49:03 +0200510 with open(FILE, "w") as fp:
511 fprint = partial(print, file=fp)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000512
Stefan Behnelfaa29482019-06-01 21:49:03 +0200513 fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
514 fprint()
515 fprint("/* a list of unique character type descriptors */")
516 fprint("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {")
517 for item in table:
518 fprint(" {%d, %d, %d, %d, %d, %d}," % item)
519 fprint("};")
520 fprint()
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500521
Stefan Behnelfaa29482019-06-01 21:49:03 +0200522 fprint("/* extended case mappings */")
523 fprint()
524 fprint("const Py_UCS4 _PyUnicode_ExtendedCase[] = {")
525 for c in extra_casing:
526 fprint(" %d," % c)
527 fprint("};")
528 fprint()
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000529
Stefan Behnelfaa29482019-06-01 21:49:03 +0200530 # split decomposition index table
531 index1, index2, shift = splitbins(index, trace)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000532
Stefan Behnelfaa29482019-06-01 21:49:03 +0200533 fprint("/* type indexes */")
534 fprint("#define SHIFT", shift)
535 Array("index1", index1).dump(fp, trace)
536 Array("index2", index2).dump(fp, trace)
Amaury Forgeot d'Arc919765a2009-10-13 23:18:53 +0000537
Stefan Behnelfaa29482019-06-01 21:49:03 +0200538 # Generate code for _PyUnicode_ToNumeric()
539 numeric_items = sorted(numeric.items())
540 fprint('/* Returns the numeric value as double for Unicode characters')
541 fprint(' * having this property, -1.0 otherwise.')
542 fprint(' */')
543 fprint('double _PyUnicode_ToNumeric(Py_UCS4 ch)')
544 fprint('{')
545 fprint(' switch (ch) {')
546 for value, codepoints in numeric_items:
547 # Turn text into float literals
548 parts = value.split('/')
549 parts = [repr(float(part)) for part in parts]
550 value = '/'.join(parts)
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +0000551
Stefan Behnelfaa29482019-06-01 21:49:03 +0200552 codepoints.sort()
553 for codepoint in codepoints:
554 fprint(' case 0x%04X:' % (codepoint,))
555 fprint(' return (double) %s;' % (value,))
556 fprint(' }')
557 fprint(' return -1.0;')
558 fprint('}')
559 fprint()
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +0000560
Stefan Behnelfaa29482019-06-01 21:49:03 +0200561 # Generate code for _PyUnicode_IsWhitespace()
562 fprint("/* Returns 1 for Unicode characters having the bidirectional")
563 fprint(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.")
564 fprint(" */")
565 fprint('int _PyUnicode_IsWhitespace(const Py_UCS4 ch)')
566 fprint('{')
567 fprint(' switch (ch) {')
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +0000568
Stefan Behnelfaa29482019-06-01 21:49:03 +0200569 for codepoint in sorted(spaces):
570 fprint(' case 0x%04X:' % (codepoint,))
571 fprint(' return 1;')
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +0000572
Stefan Behnelfaa29482019-06-01 21:49:03 +0200573 fprint(' }')
574 fprint(' return 0;')
575 fprint('}')
576 fprint()
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +0000577
Stefan Behnelfaa29482019-06-01 21:49:03 +0200578 # Generate code for _PyUnicode_IsLinebreak()
579 fprint("/* Returns 1 for Unicode characters having the line break")
580 fprint(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional")
581 fprint(" * type 'B', 0 otherwise.")
582 fprint(" */")
583 fprint('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)')
584 fprint('{')
585 fprint(' switch (ch) {')
586 for codepoint in sorted(linebreaks):
587 fprint(' case 0x%04X:' % (codepoint,))
588 fprint(' return 1;')
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +0000589
Stefan Behnelfaa29482019-06-01 21:49:03 +0200590 fprint(' }')
591 fprint(' return 0;')
592 fprint('}')
593 fprint()
594
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000595
# --------------------------------------------------------------------
# unicode name database

599def makeunicodename(unicode, trace):
600
601 FILE = "Modules/unicodename_db.h"
602
Collin Winter6afaeb72007-08-03 17:06:41 +0000603 print("--- Preparing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000604
605 # collect names
606 names = [None] * len(unicode.chars)
607
608 for char in unicode.chars:
609 record = unicode.table[char]
610 if record:
611 name = record[1].strip()
612 if name and name[0] != "<":
613 names[char] = name + chr(0)
614
Jon Dufresne39726282017-05-18 07:35:54 -0700615 print(len([n for n in names if n is not None]), "distinct names")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000616
617 # collect unique words from names (note that we differ between
618 # words inside a sentence, and words ending a sentence. the
619 # latter includes the trailing null byte.
620
621 words = {}
622 n = b = 0
623 for char in unicode.chars:
624 name = names[char]
625 if name:
626 w = name.split()
627 b = b + len(name)
628 n = n + len(w)
629 for w in w:
630 l = words.get(w)
631 if l:
632 l.append(None)
633 else:
634 words[w] = [len(words)]
635
Collin Winter6afaeb72007-08-03 17:06:41 +0000636 print(n, "words in text;", b, "bytes")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000637
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000638 wordlist = list(words.items())
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000639
Martin v. Löwis97225da2002-11-24 23:05:09 +0000640 # sort on falling frequency, then by name
Mark Dickinsona56c4672009-01-27 18:17:45 +0000641 def word_key(a):
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000642 aword, alist = a
Mark Dickinsona56c4672009-01-27 18:17:45 +0000643 return -len(alist), aword
644 wordlist.sort(key=word_key)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000645
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000646 # figure out how many phrasebook escapes we need
647 escapes = 0
648 while escapes * 256 < len(wordlist):
649 escapes = escapes + 1
Collin Winter6afaeb72007-08-03 17:06:41 +0000650 print(escapes, "escapes")
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000651
652 short = 256 - escapes
653
654 assert short > 0
655
Collin Winter6afaeb72007-08-03 17:06:41 +0000656 print(short, "short indexes in lexicon")
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000657
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000658 # statistics
659 n = 0
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000660 for i in range(short):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000661 n = n + len(wordlist[i][1])
Collin Winter6afaeb72007-08-03 17:06:41 +0000662 print(n, "short indexes in phrasebook")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000663
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000664 # pick the most commonly used words, and sort the rest on falling
665 # length (to maximize overlap)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000666
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000667 wordlist, wordtail = wordlist[:short], wordlist[short:]
Raymond Hettingerd4cb56d2008-01-30 02:55:10 +0000668 wordtail.sort(key=lambda a: a[0], reverse=True)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000669 wordlist.extend(wordtail)
670
671 # generate lexicon from words
672
673 lexicon_offset = [0]
674 lexicon = ""
675 words = {}
676
677 # build a lexicon string
678 offset = 0
679 for w, x in wordlist:
680 # encoding: bit 7 indicates last character in word (chr(128)
681 # indicates the last character in an entire string)
682 ww = w[:-1] + chr(ord(w[-1])+128)
683 # reuse string tails, when possible
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000684 o = lexicon.find(ww)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000685 if o < 0:
686 o = offset
687 lexicon = lexicon + ww
688 offset = offset + len(w)
689 words[w] = len(lexicon_offset)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000690 lexicon_offset.append(o)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000691
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000692 lexicon = list(map(ord, lexicon))
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000693
694 # generate phrasebook from names and lexicon
695 phrasebook = [0]
696 phrasebook_offset = [0] * len(unicode.chars)
697 for char in unicode.chars:
698 name = names[char]
699 if name:
700 w = name.split()
701 phrasebook_offset[char] = len(phrasebook)
702 for w in w:
703 i = words[w]
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000704 if i < short:
705 phrasebook.append(i)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000706 else:
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000707 # store as two bytes
708 phrasebook.append((i>>8) + short)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000709 phrasebook.append(i&255)
710
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000711 assert getsize(phrasebook) == 1
712
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000713 #
714 # unicode name hash table
715
716 # extract names
717 data = []
718 for char in unicode.chars:
719 record = unicode.table[char]
720 if record:
721 name = record[1].strip()
722 if name and name[0] != "<":
723 data.append((name, char))
724
725 # the magic number 47 was chosen to minimize the number of
726 # collisions on the current data set. if you like, change it
727 # and see what happens...
728
729 codehash = Hash("code", data, 47)
730
Collin Winter6afaeb72007-08-03 17:06:41 +0000731 print("--- Writing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000732
Stefan Behnelfaa29482019-06-01 21:49:03 +0200733 with open(FILE, "w") as fp:
734 fprint = partial(print, file=fp)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000735
Stefan Behnelfaa29482019-06-01 21:49:03 +0200736 fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
737 fprint()
738 fprint("#define NAME_MAXLEN", 256)
739 fprint()
740 fprint("/* lexicon */")
741 Array("lexicon", lexicon).dump(fp, trace)
742 Array("lexicon_offset", lexicon_offset).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000743
Stefan Behnelfaa29482019-06-01 21:49:03 +0200744 # split decomposition index table
745 offset1, offset2, shift = splitbins(phrasebook_offset, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000746
Stefan Behnelfaa29482019-06-01 21:49:03 +0200747 fprint("/* code->name phrasebook */")
748 fprint("#define phrasebook_shift", shift)
749 fprint("#define phrasebook_short", short)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000750
Stefan Behnelfaa29482019-06-01 21:49:03 +0200751 Array("phrasebook", phrasebook).dump(fp, trace)
752 Array("phrasebook_offset1", offset1).dump(fp, trace)
753 Array("phrasebook_offset2", offset2).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000754
Stefan Behnelfaa29482019-06-01 21:49:03 +0200755 fprint("/* name->code dictionary */")
756 codehash.dump(fp, trace)
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300757
Stefan Behnelfaa29482019-06-01 21:49:03 +0200758 fprint()
759 fprint('static const unsigned int aliases_start = %#x;' %
760 NAME_ALIASES_START)
761 fprint('static const unsigned int aliases_end = %#x;' %
762 (NAME_ALIASES_START + len(unicode.aliases)))
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300763
Stefan Behnelfaa29482019-06-01 21:49:03 +0200764 fprint('static const unsigned int name_aliases[] = {')
765 for name, codepoint in unicode.aliases:
766 fprint(' 0x%04X,' % codepoint)
767 fprint('};')
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300768
Stefan Behnelfaa29482019-06-01 21:49:03 +0200769 # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
770 # so we are using Py_UCS2 seq[4]. This needs to be updated if longer
771 # sequences or sequences with non-BMP chars are added.
772 # unicodedata_lookup should be adapted too.
773 fprint(dedent("""
774 typedef struct NamedSequence {
775 int seqlen;
776 Py_UCS2 seq[4];
777 } named_sequence;
778 """))
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300779
Stefan Behnelfaa29482019-06-01 21:49:03 +0200780 fprint('static const unsigned int named_sequences_start = %#x;' %
781 NAMED_SEQUENCES_START)
782 fprint('static const unsigned int named_sequences_end = %#x;' %
783 (NAMED_SEQUENCES_START + len(unicode.named_sequences)))
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300784
Stefan Behnelfaa29482019-06-01 21:49:03 +0200785 fprint('static const named_sequence named_sequences[] = {')
786 for name, sequence in unicode.named_sequences:
787 seq_str = ', '.join('0x%04X' % cp for cp in sequence)
788 fprint(' {%d, {%s}},' % (len(sequence), seq_str))
789 fprint('};')
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000790
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000791
def merge_old_version(version, new, old):
    """Compute the property deltas between an old UCD and the current one.

    Compares the per-codepoint records of *old* against *new* and appends
    a change record of the form
        (version, [(bidir, category, decimal, mirrored, east_asian_width,
                    numeric), ...], normalization_changes)
    to new.changed, for later emission of the version-delta tables.
    Only the field changes handled below are supported; any other
    difference raises an exception.
    """
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    east_asian_width_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ; k indexes the record fields
        # (see the record-structure comment on class UnicodeData)
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 1 and i in PUA_15:
                        # the name is not set in the old.table, but in the
                        # new.table we are using it for aliases and named seq
                        assert value == ''
                    elif k == 2:
                        # general category
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        # bidirectional class
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        # decomposition
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        # decimal digit value
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # numeric value
                        # Since 0 encodes "no change", the old value is better not 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        # bidi-mirrored flag; NOTE: stored as the strings
                        # '0'/'1', unlike the int sentinels above
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 15:
                        # change to east asian width
                        east_asian_width_changes[i] = EASTASIANWIDTH_NAMES.index(value)
                    elif k == 16:
                        # derived property changes; not yet
                        pass
                    elif k == 17:
                        # normalization quickchecks are not performed
                        # for older versions
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, list(zip(bidir_changes, category_changes,
                                          decimal_changes, mirrored_changes,
                                          east_asian_width_changes,
                                          numeric_changes)),
                       normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000887
Stefan Behnelfaa29482019-06-01 21:49:03 +0200888
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000889def open_data(template, version):
890 local = template % ('-'+version,)
891 if not os.path.exists(local):
892 import urllib.request
893 if version == '3.2.0':
894 # irregular url structure
895 url = 'http://www.unicode.org/Public/3.2-Update/' + local
896 else:
897 url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
898 urllib.request.urlretrieve(url, filename=local)
899 if local.endswith('.txt'):
900 return open(local, encoding='utf-8')
901 else:
902 # Unihan.zip
903 return open(local, 'rb')
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000904
Stefan Behnelfaa29482019-06-01 21:49:03 +0200905
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000906# --------------------------------------------------------------------
907# the following support code is taken from the unidb utilities
908# Copyright (c) 1999-2000 by Secret Labs AB
909
910# load a unicode-data file from disk
911
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000912class UnicodeData:
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000913 # Record structure:
914 # [ID, name, category, combining, bidi, decomp, (6)
915 # decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
916 # ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
917 # derived-props] (17)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000918
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000919 def __init__(self, version,
920 linebreakprops=False,
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000921 expand=1,
922 cjk_check=True):
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000923 self.changed = []
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000924 table = [None] * 0x110000
Ezio Melotti2a1e9262011-09-30 08:46:25 +0300925 with open_data(UNICODE_DATA, version) as file:
926 while 1:
927 s = file.readline()
928 if not s:
929 break
930 s = s.strip().split(";")
931 char = int(s[0], 16)
932 table[char] = s
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000933
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000934 cjk_ranges_found = []
935
Martin v. Löwis97225da2002-11-24 23:05:09 +0000936 # expand first-last ranges
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000937 if expand:
938 field = None
Martin v. Löwis97225da2002-11-24 23:05:09 +0000939 for i in range(0, 0x110000):
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000940 s = table[i]
941 if s:
942 if s[1][-6:] == "First>":
943 s[1] = ""
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000944 field = s
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000945 elif s[1][-5:] == "Last>":
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000946 if s[1].startswith("<CJK Ideograph"):
947 cjk_ranges_found.append((field[0],
948 s[0]))
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000949 s[1] = ""
950 field = None
951 elif field:
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000952 f2 = field[:]
953 f2[0] = "%X" % i
954 table[i] = f2
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000955 if cjk_check and cjk_ranges != cjk_ranges_found:
956 raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000957
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000958 # public attributes
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000959 self.filename = UNICODE_DATA % ''
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000960 self.table = table
Georg Brandlbf82e372008-05-16 17:02:34 +0000961 self.chars = list(range(0x110000)) # unicode 3.2
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000962
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300963 # check for name aliases and named sequences, see #12753
964 # aliases and named sequences are not in 3.2.0
965 if version != '3.2.0':
966 self.aliases = []
967 # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
968 # in order to take advantage of the compression and lookup
969 # algorithms used for the other characters
970 pua_index = NAME_ALIASES_START
971 with open_data(NAME_ALIASES, version) as file:
972 for s in file:
973 s = s.strip()
974 if not s or s.startswith('#'):
975 continue
Benjamin Peterson71f660e2012-02-20 22:24:29 -0500976 char, name, abbrev = s.split(';')
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300977 char = int(char, 16)
978 self.aliases.append((name, char))
979 # also store the name in the PUA 1
980 self.table[pua_index][1] = name
981 pua_index += 1
982 assert pua_index - NAME_ALIASES_START == len(self.aliases)
983
984 self.named_sequences = []
Ezio Melotti7c4a7e62013-08-26 01:32:56 +0300985 # store named sequences in the PUA 1, in range U+F0100..,
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300986 # in order to take advantage of the compression and lookup
987 # algorithms used for the other characters.
988
Benjamin Peterson71f660e2012-02-20 22:24:29 -0500989 assert pua_index < NAMED_SEQUENCES_START
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300990 pua_index = NAMED_SEQUENCES_START
991 with open_data(NAMED_SEQUENCES, version) as file:
992 for s in file:
993 s = s.strip()
994 if not s or s.startswith('#'):
995 continue
996 name, chars = s.split(';')
997 chars = tuple(int(char, 16) for char in chars.split())
998 # check that the structure defined in makeunicodename is OK
999 assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
1000 assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
1001 "the NamedSequence struct and in unicodedata_lookup")
1002 self.named_sequences.append((name, chars))
1003 # also store these in the PUA 1
1004 self.table[pua_index][1] = name
1005 pua_index += 1
1006 assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)
1007
Martin v. Löwis677bde22002-11-23 22:08:15 +00001008 self.exclusions = {}
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001009 with open_data(COMPOSITION_EXCLUSIONS, version) as file:
1010 for s in file:
1011 s = s.strip()
1012 if not s:
1013 continue
1014 if s[0] == '#':
1015 continue
1016 char = int(s.split()[0],16)
1017 self.exclusions[char] = 1
Martin v. Löwis677bde22002-11-23 22:08:15 +00001018
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00001019 widths = [None] * 0x110000
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001020 with open_data(EASTASIAN_WIDTH, version) as file:
1021 for s in file:
1022 s = s.strip()
1023 if not s:
1024 continue
1025 if s[0] == '#':
1026 continue
1027 s = s.split()[0].split(';')
1028 if '..' in s[0]:
1029 first, last = [int(c, 16) for c in s[0].split('..')]
1030 chars = list(range(first, last+1))
1031 else:
1032 chars = [int(s[0], 16)]
1033 for char in chars:
1034 widths[char] = s[1]
1035
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00001036 for i in range(0, 0x110000):
1037 if table[i] is not None:
1038 table[i].append(widths[i])
1039
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001040 for i in range(0, 0x110000):
1041 if table[i] is not None:
1042 table[i].append(set())
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001043
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001044 with open_data(DERIVED_CORE_PROPERTIES, version) as file:
1045 for s in file:
1046 s = s.split('#', 1)[0].strip()
1047 if not s:
1048 continue
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001049
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001050 r, p = s.split(";")
1051 r = r.strip()
1052 p = p.strip()
1053 if ".." in r:
1054 first, last = [int(c, 16) for c in r.split('..')]
1055 chars = list(range(first, last+1))
1056 else:
1057 chars = [int(r, 16)]
1058 for char in chars:
1059 if table[char]:
1060 # Some properties (e.g. Default_Ignorable_Code_Point)
1061 # apply to unassigned code points; ignore them
1062 table[char][-1].add(p)
1063
1064 with open_data(LINE_BREAK, version) as file:
1065 for s in file:
1066 s = s.partition('#')[0]
1067 s = [i.strip() for i in s.split(';')]
1068 if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
1069 continue
1070 if '..' not in s[0]:
1071 first = last = int(s[0], 16)
1072 else:
1073 first, last = [int(c, 16) for c in s[0].split('..')]
1074 for char in range(first, last+1):
1075 table[char][-1].add('Line_Break')
Florent Xicluna806d8cf2010-03-30 19:34:18 +00001076
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001077 # We only want the quickcheck properties
1078 # Format: NF?_QC; Y(es)/N(o)/M(aybe)
1079 # Yes is the default, hence only N and M occur
1080 # In 3.2.0, the format was different (NF?_NO)
1081 # The parsing will incorrectly determine these as
1082 # "yes", however, unicodedata.c will not perform quickchecks
1083 # for older versions, and no delta records will be created.
1084 quickchecks = [0] * 0x110000
1085 qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001086 with open_data(DERIVEDNORMALIZATION_PROPS, version) as file:
1087 for s in file:
1088 if '#' in s:
1089 s = s[:s.index('#')]
1090 s = [i.strip() for i in s.split(';')]
1091 if len(s) < 2 or s[1] not in qc_order:
1092 continue
1093 quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
1094 quickcheck_shift = qc_order.index(s[1])*2
1095 quickcheck <<= quickcheck_shift
1096 if '..' not in s[0]:
1097 first = last = int(s[0], 16)
1098 else:
1099 first, last = [int(c, 16) for c in s[0].split('..')]
1100 for char in range(first, last+1):
1101 assert not (quickchecks[char]>>quickcheck_shift)&3
1102 quickchecks[char] |= quickcheck
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001103 for i in range(0, 0x110000):
1104 if table[i] is not None:
1105 table[i].append(quickchecks[i])
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +00001106
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001107 with open_data(UNIHAN, version) as file:
1108 zip = zipfile.ZipFile(file)
1109 if version == '3.2.0':
1110 data = zip.open('Unihan-3.2.0.txt').read()
1111 else:
1112 data = zip.open('Unihan_NumericValues.txt').read()
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001113 for line in data.decode("utf-8").splitlines():
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +00001114 if not line.startswith('U+'):
1115 continue
1116 code, tag, value = line.split(None, 3)[:3]
1117 if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
1118 'kOtherNumeric'):
1119 continue
1120 value = value.strip().replace(',', '')
1121 i = int(code[2:], 16)
1122 # Patch the numeric field
1123 if table[i] is not None:
1124 table[i][8] = value
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05001125 sc = self.special_casing = {}
1126 with open_data(SPECIAL_CASING, version) as file:
1127 for s in file:
1128 s = s[:-1].split('#', 1)[0]
1129 if not s:
1130 continue
1131 data = s.split("; ")
1132 if data[4]:
1133 # We ignore all conditionals (since they depend on
1134 # languages) except for one, which is hardcoded. See
1135 # handle_capital_sigma in unicodeobject.c.
1136 continue
1137 c = int(data[0], 16)
1138 lower = [int(char, 16) for char in data[1].split()]
1139 title = [int(char, 16) for char in data[2].split()]
1140 upper = [int(char, 16) for char in data[3].split()]
1141 sc[c] = (lower, title, upper)
Benjamin Petersond5890c82012-01-14 13:23:30 -05001142 cf = self.case_folding = {}
1143 if version != '3.2.0':
1144 with open_data(CASE_FOLDING, version) as file:
1145 for s in file:
1146 s = s[:-1].split('#', 1)[0]
1147 if not s:
1148 continue
1149 data = s.split("; ")
1150 if data[1] in "CF":
1151 c = int(data[0], 16)
1152 cf[c] = [int(char, 16) for char in data[2].split()]
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +00001153
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001154 def uselatin1(self):
1155 # restrict character range to ISO Latin 1
Georg Brandlbf82e372008-05-16 17:02:34 +00001156 self.chars = list(range(256))
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001157
Stefan Behnelfaa29482019-06-01 21:49:03 +02001158
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001159# hash table tools
1160
1161# this is a straight-forward reimplementation of Python's built-in
1162# dictionary type, using a static data structure, and a custom string
1163# hash algorithm.
1164
1165def myhash(s, magic):
1166 h = 0
Walter Dörwaldaaab30e2002-09-11 20:36:02 +00001167 for c in map(ord, s.upper()):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001168 h = (h * magic) + c
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001169 ix = h & 0xff000000
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001170 if ix:
1171 h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
1172 return h
1173
Stefan Behnelfaa29482019-06-01 21:49:03 +02001174
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001175SIZES = [
1176 (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
1177 (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
1178 (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
1179 (2097152,5), (4194304,3), (8388608,33), (16777216,27)
1180]
1181
Stefan Behnelfaa29482019-06-01 21:49:03 +02001182
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001183class Hash:
1184 def __init__(self, name, data, magic):
1185 # turn a (key, value) list into a static hash table structure
1186
1187 # determine table size
1188 for size, poly in SIZES:
1189 if size > len(data):
1190 poly = size + poly
1191 break
1192 else:
Ezio Melotti13925002011-03-16 11:05:33 +02001193 raise AssertionError("ran out of polynomials")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001194
Collin Winter6afaeb72007-08-03 17:06:41 +00001195 print(size, "slots in hash table")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001196
1197 table = [None] * size
1198
1199 mask = size-1
1200
1201 n = 0
1202
1203 hash = myhash
1204
1205 # initialize hash table
1206 for key, value in data:
1207 h = hash(key, magic)
1208 i = (~h) & mask
1209 v = table[i]
1210 if v is None:
1211 table[i] = value
1212 continue
Stefan Behnelfaa29482019-06-01 21:49:03 +02001213 incr = (h ^ (h >> 3)) & mask
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001214 if not incr:
1215 incr = mask
1216 while 1:
1217 n = n + 1
1218 i = (i + incr) & mask
1219 v = table[i]
1220 if v is None:
1221 table[i] = value
1222 break
1223 incr = incr << 1
1224 if incr > mask:
1225 incr = incr ^ poly
1226
Collin Winter6afaeb72007-08-03 17:06:41 +00001227 print(n, "collisions")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001228 self.collisions = n
1229
1230 for i in range(len(table)):
1231 if table[i] is None:
1232 table[i] = 0
1233
1234 self.data = Array(name + "_hash", table)
1235 self.magic = magic
1236 self.name = name
1237 self.size = size
1238 self.poly = poly
1239
Fredrik Lundh7b7dd102001-01-21 22:41:08 +00001240 def dump(self, file, trace):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001241 # write data to file, as a C array
Fredrik Lundh7b7dd102001-01-21 22:41:08 +00001242 self.data.dump(file, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001243 file.write("#define %s_magic %d\n" % (self.name, self.magic))
1244 file.write("#define %s_size %d\n" % (self.name, self.size))
1245 file.write("#define %s_poly %d\n" % (self.name, self.poly))
1246
Stefan Behnelfaa29482019-06-01 21:49:03 +02001247
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001248# stuff to deal with arrays of unsigned integers
1249
class Array:
    """A named sequence of unsigned integers, emitted as a static C array."""

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        """Write the array to *file* as a `static const` C declaration.

        The element type is the narrowest unsigned type (char/short/int)
        that fits every value, and output lines wrap at 78 columns.  If
        *trace* is true, the total byte size is reported on stderr.
        """
        width = getsize(self.data)
        if trace:
            print(self.name+":", width*len(self.data), "bytes", file=sys.stderr)
        if width == 1:
            ctype = "unsigned char"
        elif width == 2:
            ctype = "unsigned short"
        else:
            ctype = "unsigned int"
        file.write("static const ")
        file.write(ctype)
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            line = "    "
            for value in self.data:
                token = str(value) + ", "
                if len(line) + len(token) > 78:
                    # current line is full: flush it and start a new one
                    file.write(line.rstrip() + "\n")
                    line = "    " + token
                else:
                    line = line + token
            if line.strip():
                file.write(line.rstrip() + "\n")
        file.write("};\n\n")
1281
Stefan Behnelfaa29482019-06-01 21:49:03 +02001282
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001283def getsize(data):
1284 # return smallest possible integer size for the given array
1285 maxdata = max(data)
1286 if maxdata < 256:
1287 return 1
1288 elif maxdata < 65536:
1289 return 2
1290 else:
1291 return 4
1292
Stefan Behnelfaa29482019-06-01 21:49:03 +02001293
Tim Peters21013482000-09-25 07:13:41 +00001294def splitbins(t, trace=0):
1295 """t, trace=0 -> (t1, t2, shift). Split a table to save space.
1296
1297 t is a sequence of ints. This function can be useful to save space if
1298 many of the ints are the same. t1 and t2 are lists of ints, and shift
1299 is an int, chosen to minimize the combined size of t1 and t2 (in C
1300 code), and where for each i in range(len(t)),
1301 t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1302 where mask is a bitmask isolating the last "shift" bits.
1303
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001304 If optional arg trace is non-zero (default zero), progress info
1305 is printed to sys.stderr. The higher the value, the more info
1306 you'll get.
Tim Peters21013482000-09-25 07:13:41 +00001307 """
1308
Tim Peters21013482000-09-25 07:13:41 +00001309 if trace:
1310 def dump(t1, t2, shift, bytes):
Collin Winter6afaeb72007-08-03 17:06:41 +00001311 print("%d+%d bins at shift %d; %d bytes" % (
1312 len(t1), len(t2), shift, bytes), file=sys.stderr)
Stefan Behnelfaa29482019-06-01 21:49:03 +02001313 print("Size of original table:", len(t)*getsize(t), "bytes",
1314 file=sys.stderr)
Tim Peters21013482000-09-25 07:13:41 +00001315 n = len(t)-1 # last valid index
1316 maxshift = 0 # the most we can shift n and still have something left
1317 if n > 0:
1318 while n >> 1:
1319 n >>= 1
1320 maxshift += 1
1321 del n
Christian Heimesa37d4c62007-12-04 23:02:19 +00001322 bytes = sys.maxsize # smallest total size so far
Tim Peters21013482000-09-25 07:13:41 +00001323 t = tuple(t) # so slices can be dict keys
1324 for shift in range(maxshift + 1):
1325 t1 = []
1326 t2 = []
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001327 size = 2**shift
1328 bincache = {}
Tim Peters21013482000-09-25 07:13:41 +00001329 for i in range(0, len(t), size):
1330 bin = t[i:i+size]
1331 index = bincache.get(bin)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001332 if index is None:
Tim Peters21013482000-09-25 07:13:41 +00001333 index = len(t2)
1334 bincache[bin] = index
1335 t2.extend(bin)
1336 t1.append(index >> shift)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001337 # determine memory size
Tim Peters21013482000-09-25 07:13:41 +00001338 b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001339 if trace > 1:
Tim Peters21013482000-09-25 07:13:41 +00001340 dump(t1, t2, shift, b)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001341 if b < bytes:
Tim Peters21013482000-09-25 07:13:41 +00001342 best = t1, t2, shift
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001343 bytes = b
Tim Peters21013482000-09-25 07:13:41 +00001344 t1, t2, shift = best
1345 if trace:
Collin Winter6afaeb72007-08-03 17:06:41 +00001346 print("Best:", end=' ', file=sys.stderr)
Tim Peters21013482000-09-25 07:13:41 +00001347 dump(t1, t2, shift, bytes)
1348 if __debug__:
1349 # exhaustively verify that the decomposition is correct
1350 mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
Guido van Rossum805365e2007-05-07 22:24:25 +00001351 for i in range(len(t)):
Tim Peters21013482000-09-25 07:13:41 +00001352 assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1353 return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001354
Stefan Behnelfaa29482019-06-01 21:49:03 +02001355
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001356if __name__ == "__main__":
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001357 maketables(1)