blob: 1be93ec479c8dc856dc874fa459b6ce6c66f417d [file] [log] [blame]
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001#
Fredrik Lundhe9133f72000-09-25 17:59:57 +00002# (re)generate unicode property and type databases
3#
Benjamin Peterson7c69c1c2018-06-06 20:14:28 -07004# This script converts Unicode database files to Modules/unicodedata_db.h,
5# Modules/unicodename_db.h, and Objects/unicodetype_db.h
Fredrik Lundhcfcea492000-09-25 08:07:06 +00006#
7# history:
8# 2000-09-24 fl created (based on bits and pieces from unidb)
9# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
Fredrik Lundhe9133f72000-09-25 17:59:57 +000010# 2000-09-25 fl added character type table
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000011# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000012# 2000-11-03 fl expand first/last ranges
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000013# 2001-01-19 fl added character name tables (2.1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000014# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold
Martin v. Löwis677bde22002-11-23 22:08:15 +000015# 2002-09-11 wd use string methods
16# 2002-10-18 mvl update to Unicode 3.2
17# 2002-10-22 mvl generate NFC tables
Martin v. Löwis97225da2002-11-24 23:05:09 +000018# 2002-11-24 mvl expand all ranges, sort names version-independently
Martin v. Löwisb5c980b2002-11-25 09:13:37 +000019# 2002-11-25 mvl add UNIDATA_VERSION
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +000020# 2004-05-29 perky add east asian width information
Martin v. Löwis43179c82006-03-11 12:43:44 +000021# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
Georg Brandld52429f2008-07-04 15:55:02 +000022# 2008-06-11 gb add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
Ezio Melotti931b8aa2011-10-21 21:57:36 +030023# 2011-10-21 ezio add support for name aliases and named sequences
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050024# 2012-01 benjamin add full case mappings
Fredrik Lundhcfcea492000-09-25 08:07:06 +000025#
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000026# written by Fredrik Lundh (fredrik@pythonware.com)
Fredrik Lundhf367cac2000-09-24 23:18:31 +000027#
28
Ezio Melotti931b8aa2011-10-21 21:57:36 +030029import os
30import sys
31import zipfile
32
Stefan Behnelfaa29482019-06-01 21:49:03 +020033from functools import partial
Greg Priceef2af1a2019-08-12 22:20:56 -070034from textwrap import dedent
35from typing import *
Fredrik Lundhf367cac2000-09-24 23:18:31 +000036
SCRIPT = sys.argv[0]
VERSION = "3.3"

# The Unicode Database
# --------------------
# When changing UCD version please update
# * Doc/library/stdtypes.rst, and
# * Doc/library/unicodedata.rst
# * Doc/reference/lexical_analysis.rst (two occurrences)
UNIDATA_VERSION = "12.1.0"

# Input file name templates.  "%s" is filled with "" for the current UCD
# version or with "-<version>" for an older snapshot (see maketables()).
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt"
CASE_FOLDING = "CaseFolding%s.txt"

# Private Use Areas -- in planes 1, 15, 16
PUA_1 = range(0xE000, 0xF900)
PUA_15 = range(0xF0000, 0xFFFFE)
PUA_16 = range(0x100000, 0x10FFFE)

# we use these ranges of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0200

# Older UCD versions for which delta tables are generated (see
# makeunicodedata() and merge_old_version()).
old_versions = ["3.2.0"]

# General_Category values; index in this list is the numeric code stored
# in the database records.  ("Cn" appears twice in the historical table.)
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

# Bidi_Class values; index is the numeric code stored in the records.
BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON", "LRI", "RLI", "FSI", "PDI" ]

# East_Asian_Width values; index is the numeric code stored in the records.
EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

# Line_Break classes that force a mandatory break (feed LINEBREAK_MASK).
MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# character-type bit flags
# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NUMERIC_MASK = 0x800
CASE_IGNORABLE_MASK = 0x1000
CASED_MASK = 0x2000
EXTENDED_CASE_MASK = 0x4000

# CJK Unified Ideograph first/last code points, as hex strings.
# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
    ('3400', '4DB5'),
    ('4E00', '9FEF'),
    ('20000', '2A6D6'),
    ('2A700', '2B734'),
    ('2B740', '2B81D'),
    ('2B820', '2CEA1'),
    ('2CEB0', '2EBE0'),
]
110
Stefan Behnelfaa29482019-06-01 21:49:03 +0200111
def maketables(trace=0):
    """Drive regeneration of all three generated headers.

    Reads the current UCD (UNIDATA_VERSION), folds in delta information
    for every entry in ``old_versions``, then writes the name, property,
    and type databases.

    trace: forwarded to the individual generators for packing diagnostics.
    """
    def count_defined(ucd):
        # Number of code points that have a UnicodeData record.
        return sum(1 for rec in ucd.table if rec)

    print("--- Reading", UNICODE_DATA % "", "...")
    unicode = UnicodeData(UNIDATA_VERSION)
    print(count_defined(unicode), "characters")

    # Merge each supported older UCD version so version-delta tables can
    # be generated later by makeunicodedata().
    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-" + version), "...")
        old_unicode = UnicodeData(version, cjk_check=False)
        print(count_defined(old_unicode), "characters")
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000129
Stefan Behnelfaa29482019-06-01 21:49:03 +0200130
# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):
    """Write Modules/unicodedata_db.h.

    Emits: the deduplicated per-character database records, the
    decomposition tables, the NFC first/last reindexing plus pair table,
    and per-version change/normalization delta tables.

    unicode: a UnicodeData instance for the current UCD version.
    trace:   forwarded to splitbins() for table-packing diagnostics.
    """
    # dummy record for unassigned code points (all fields zero)
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties
    # Each unique (category, combining, bidi, mirrored, east-asian-width,
    # quick-check) tuple is stored once; index[] maps char -> record slot.

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            normalizationquickcheck = record[17]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data
    # decomp_data is a flat int array: one header word per decomposition
    # (prefix code in the low byte, length in the next byte) followed by
    # the decomposed code points.

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                # length is stored in one byte of the header word; the C
                # side (nfd_nfkd) cannot handle longer expansions
                if len(decomp) > 19:
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix: compatibility tag such as "<compat>", if present
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content: header word (prefix | length<<8) + code points
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs: canonical (no prefix) two-character
                # decompositions, excluding composition exclusions and
                # pairs whose first character has a nonzero combining class
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                # NOTE(review): decomp_data is a flat list of ints, so
                # .index(decomp) (a list) always raises ValueError and the
                # sharing branch never fires; entries are always appended.
                # Output stays correct, only dedup never happens -- confirm
                # whether this is intended.
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    # Renumber NFC first/last characters densely (0..total-1) and record
    # the contiguous code-point ranges for the C-side reindex tables.
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    # 2D composition table, flattened: comp_data[first*total_last+last]
    # is the composed character (0 when the pair does not compose).
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    with open(FILE, "w") as fp:
        fprint = partial(print, file=fp)

        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
        fprint()
        fprint('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION)
        fprint("/* a list of unique database records */")
        fprint("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {")
        for item in table:
            fprint("    {%d, %d, %d, %d, %d, %d}," % item)
        fprint("};")
        fprint()

        fprint("/* Reindexing of NFC first characters. */")
        fprint("#define TOTAL_FIRST",total_first)
        fprint("#define TOTAL_LAST",total_last)
        fprint("struct reindex{int start;short count,index;};")
        fprint("static struct reindex nfc_first[] = {")
        for start,end in comp_first_ranges:
            fprint("    { %d, %d, %d}," % (start,end-start,comp_first[start]))
        fprint("    {0,0,0}")
        fprint("};\n")
        fprint("static struct reindex nfc_last[] = {")
        for start,end in comp_last_ranges:
            fprint("    { %d, %d, %d}," % (start,end-start,comp_last[start]))
        fprint("    {0,0,0}")
        fprint("};\n")

        # FIXME: <fl> the following tables could be made static, and
        # the support code moved into unicodedatabase.c

        fprint("/* string literals */")
        fprint("const char *_PyUnicode_CategoryNames[] = {")
        for name in CATEGORY_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("const char *_PyUnicode_BidirectionalNames[] = {")
        for name in BIDIRECTIONAL_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("const char *_PyUnicode_EastAsianWidthNames[] = {")
        for name in EASTASIANWIDTH_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("static const char *decomp_prefix[] = {")
        for name in decomp_prefix:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        # split record index table
        index1, index2, shift = splitbins(index, trace)

        fprint("/* index tables for the database records */")
        fprint("#define SHIFT", shift)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # split decomposition index table
        index1, index2, shift = splitbins(decomp_index, trace)

        fprint("/* decomposition data */")
        Array("decomp_data", decomp_data).dump(fp, trace)

        fprint("/* index tables for the decomposition data */")
        fprint("#define DECOMP_SHIFT", shift)
        Array("decomp_index1", index1).dump(fp, trace)
        Array("decomp_index2", index2).dump(fp, trace)

        index, index2, shift = splitbins(comp_data, trace)
        fprint("/* NFC pairs */")
        fprint("#define COMP_SHIFT", shift)
        Array("comp_index", index).dump(fp, trace)
        Array("comp_data", index2).dump(fp, trace)

        # Generate delta tables for old versions
        for version, table, normalization in unicode.changed:
            cversion = version.replace(".","_")
            records = [table[0]]
            cache = {table[0]:0}
            index = [0] * len(table)
            for i, record in enumerate(table):
                try:
                    index[i] = cache[record]
                except KeyError:
                    index[i] = cache[record] = len(records)
                    records.append(record)
            index1, index2, shift = splitbins(index, trace)
            fprint("static const change_record change_records_%s[] = {" % cversion)
            for record in records:
                fprint("    { %s }," % ", ".join(map(str,record)))
            fprint("};")
            Array("changes_%s_index" % cversion, index1).dump(fp, trace)
            Array("changes_%s_data" % cversion, index2).dump(fp, trace)
            # accessor: two-level lookup into the change records
            fprint("static const change_record* get_change_%s(Py_UCS4 n)" % cversion)
            fprint("{")
            fprint("    int index;")
            fprint("    if (n >= 0x110000) index = 0;")
            fprint("    else {")
            fprint("        index = changes_%s_index[n>>%d];" % (cversion, shift))
            fprint("        index = changes_%s_data[(index<<%d)+(n & %d)];" % \
                   (cversion, shift, ((1<<shift)-1)))
            fprint("    }")
            fprint("    return change_records_%s+index;" % cversion)
            fprint("}\n")
            # per-version normalization exceptions as a switch statement
            fprint("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion)
            fprint("{")
            fprint("    switch(n) {")
            for k, v in normalization:
                fprint("    case %s: return 0x%s;" % (hex(k), v))
            fprint("    default: return 0;")
            fprint("    }\n}\n")
382
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000383
# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):
    """Write Objects/unicodetype_db.h.

    Emits: the deduplicated character type records (case offsets, digit
    values, flag bits), the extended-case array for one-to-many mappings,
    and generated C helpers _PyUnicode_ToNumeric(),
    _PyUnicode_IsWhitespace(), and _PyUnicode_IsLinebreak().

    unicode: a UnicodeData instance for the current UCD version.
    trace:   forwarded to splitbins() for table-packing diagnostics.
    """
    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}            # Numeric_Value text -> [code points]
    spaces = []             # code points with SPACE_MASK
    linebreaks = []         # code points with LINEBREAK_MASK
    extra_casing = []       # flat array for one-to-many case mappings

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            # (field indexes follow the UnicodeData.txt layout, UAX #44)
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if "Lowercase" in properties:
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if "Uppercase" in properties:
                flags |= UPPER_MASK
            # space is printable despite being in category Zs
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            if "Cased" in properties:
                flags |= CASED_MASK
            if "Case_Ignorable" in properties:
                flags |= CASE_IGNORABLE_MASK
            # simple case mappings (fields 12-14), falling back to char
            # itself; full mappings come from SpecialCasing/CaseFolding
            sc = unicode.special_casing.get(char)
            cf = unicode.case_folding.get(char, [char])
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                title = upper
            # a case folding differing from the lowercase mapping forces
            # the extended-case representation below
            if sc is None and cf != [lower]:
                sc = ([lower], [title], [upper])
            if sc is None:
                # simple case: store signed deltas from the character
                if upper == lower == title:
                    upper = lower = title = 0
                else:
                    upper = upper - char
                    lower = lower - char
                    title = title - char
                    # deltas must fit in a signed 32-bit int
                    assert (abs(upper) <= 2147483647 and
                            abs(lower) <= 2147483647 and
                            abs(title) <= 2147483647)
            else:
                # This happens either when some character maps to more than one
                # character in uppercase, lowercase, or titlecase or the
                # casefolded version of the character is different from the
                # lowercase. The extra characters are stored in a different
                # array.
                # Packed format: bits 0-19 offset into extra_casing,
                # bits 24-31 mapping length; for "lower", bits 20-23 hold
                # the length of a trailing case-folding sequence.
                flags |= EXTENDED_CASE_MASK
                lower = len(extra_casing) | (len(sc[0]) << 24)
                extra_casing.extend(sc[0])
                if cf != sc[0]:
                    lower |= len(cf) << 20
                    extra_casing.extend(cf)
                upper = len(extra_casing) | (len(sc[2]) << 24)
                extra_casing.extend(sc[2])
                # Title is probably equal to upper.
                if sc[1] == sc[2]:
                    title = upper
                else:
                    title = len(extra_casing) | (len(sc[1]) << 24)
                    extra_casing.extend(sc[1])
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")
    print(len(extra_casing), "extended case array")

    print("--- Writing", FILE, "...")

    with open(FILE, "w") as fp:
        fprint = partial(print, file=fp)

        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
        fprint()
        fprint("/* a list of unique character type descriptors */")
        fprint("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {")
        for item in table:
            fprint("    {%d, %d, %d, %d, %d, %d}," % item)
        fprint("};")
        fprint()

        fprint("/* extended case mappings */")
        fprint()
        fprint("const Py_UCS4 _PyUnicode_ExtendedCase[] = {")
        for c in extra_casing:
            fprint("    %d," % c)
        fprint("};")
        fprint()

        # split decomposition index table
        index1, index2, shift = splitbins(index, trace)

        fprint("/* type indexes */")
        fprint("#define SHIFT", shift)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # Generate code for _PyUnicode_ToNumeric()
        numeric_items = sorted(numeric.items())
        fprint('/* Returns the numeric value as double for Unicode characters')
        fprint(' * having this property, -1.0 otherwise.')
        fprint(' */')
        fprint('double _PyUnicode_ToNumeric(Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')
        for value, codepoints in numeric_items:
            # Turn text into float literals (fractions like "1/4" become
            # "0.25"-style literal divisions)
            parts = value.split('/')
            parts = [repr(float(part)) for part in parts]
            value = '/'.join(parts)

            codepoints.sort()
            for codepoint in codepoints:
                fprint('    case 0x%04X:' % (codepoint,))
            fprint('        return (double) %s;' % (value,))
        fprint('    }')
        fprint('    return -1.0;')
        fprint('}')
        fprint()

        # Generate code for _PyUnicode_IsWhitespace()
        fprint("/* Returns 1 for Unicode characters having the bidirectional")
        fprint(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.")
        fprint(" */")
        fprint('int _PyUnicode_IsWhitespace(const Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')

        for codepoint in sorted(spaces):
            fprint('    case 0x%04X:' % (codepoint,))
        fprint('        return 1;')

        fprint('    }')
        fprint('    return 0;')
        fprint('}')
        fprint()

        # Generate code for _PyUnicode_IsLinebreak()
        fprint("/* Returns 1 for Unicode characters having the line break")
        fprint(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional")
        fprint(" * type 'B', 0 otherwise.")
        fprint(" */")
        fprint('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')
        for codepoint in sorted(linebreaks):
            fprint('    case 0x%04X:' % (codepoint,))
        fprint('        return 1;')

        fprint('    }')
        fprint('    return 0;')
        fprint('}')
        fprint()
595
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000596
597# --------------------------------------------------------------------
598# unicode name database
599
600def makeunicodename(unicode, trace):
601
602 FILE = "Modules/unicodename_db.h"
603
Collin Winter6afaeb72007-08-03 17:06:41 +0000604 print("--- Preparing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000605
606 # collect names
607 names = [None] * len(unicode.chars)
608
609 for char in unicode.chars:
610 record = unicode.table[char]
611 if record:
612 name = record[1].strip()
613 if name and name[0] != "<":
614 names[char] = name + chr(0)
615
Jon Dufresne39726282017-05-18 07:35:54 -0700616 print(len([n for n in names if n is not None]), "distinct names")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000617
618 # collect unique words from names (note that we differ between
619 # words inside a sentence, and words ending a sentence. the
620 # latter includes the trailing null byte.
621
622 words = {}
623 n = b = 0
624 for char in unicode.chars:
625 name = names[char]
626 if name:
627 w = name.split()
628 b = b + len(name)
629 n = n + len(w)
630 for w in w:
631 l = words.get(w)
632 if l:
633 l.append(None)
634 else:
635 words[w] = [len(words)]
636
Collin Winter6afaeb72007-08-03 17:06:41 +0000637 print(n, "words in text;", b, "bytes")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000638
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000639 wordlist = list(words.items())
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000640
Martin v. Löwis97225da2002-11-24 23:05:09 +0000641 # sort on falling frequency, then by name
Mark Dickinsona56c4672009-01-27 18:17:45 +0000642 def word_key(a):
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000643 aword, alist = a
Mark Dickinsona56c4672009-01-27 18:17:45 +0000644 return -len(alist), aword
645 wordlist.sort(key=word_key)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000646
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000647 # figure out how many phrasebook escapes we need
648 escapes = 0
649 while escapes * 256 < len(wordlist):
650 escapes = escapes + 1
Collin Winter6afaeb72007-08-03 17:06:41 +0000651 print(escapes, "escapes")
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000652
653 short = 256 - escapes
654
655 assert short > 0
656
Collin Winter6afaeb72007-08-03 17:06:41 +0000657 print(short, "short indexes in lexicon")
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000658
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000659 # statistics
660 n = 0
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000661 for i in range(short):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000662 n = n + len(wordlist[i][1])
Collin Winter6afaeb72007-08-03 17:06:41 +0000663 print(n, "short indexes in phrasebook")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000664
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000665 # pick the most commonly used words, and sort the rest on falling
666 # length (to maximize overlap)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000667
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000668 wordlist, wordtail = wordlist[:short], wordlist[short:]
Raymond Hettingerd4cb56d2008-01-30 02:55:10 +0000669 wordtail.sort(key=lambda a: a[0], reverse=True)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000670 wordlist.extend(wordtail)
671
672 # generate lexicon from words
673
674 lexicon_offset = [0]
675 lexicon = ""
676 words = {}
677
678 # build a lexicon string
679 offset = 0
680 for w, x in wordlist:
681 # encoding: bit 7 indicates last character in word (chr(128)
682 # indicates the last character in an entire string)
683 ww = w[:-1] + chr(ord(w[-1])+128)
684 # reuse string tails, when possible
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000685 o = lexicon.find(ww)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000686 if o < 0:
687 o = offset
688 lexicon = lexicon + ww
689 offset = offset + len(w)
690 words[w] = len(lexicon_offset)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000691 lexicon_offset.append(o)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000692
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000693 lexicon = list(map(ord, lexicon))
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000694
695 # generate phrasebook from names and lexicon
696 phrasebook = [0]
697 phrasebook_offset = [0] * len(unicode.chars)
698 for char in unicode.chars:
699 name = names[char]
700 if name:
701 w = name.split()
702 phrasebook_offset[char] = len(phrasebook)
703 for w in w:
704 i = words[w]
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000705 if i < short:
706 phrasebook.append(i)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000707 else:
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000708 # store as two bytes
709 phrasebook.append((i>>8) + short)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000710 phrasebook.append(i&255)
711
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000712 assert getsize(phrasebook) == 1
713
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000714 #
715 # unicode name hash table
716
717 # extract names
718 data = []
719 for char in unicode.chars:
720 record = unicode.table[char]
721 if record:
722 name = record[1].strip()
723 if name and name[0] != "<":
724 data.append((name, char))
725
726 # the magic number 47 was chosen to minimize the number of
727 # collisions on the current data set. if you like, change it
728 # and see what happens...
729
730 codehash = Hash("code", data, 47)
731
Collin Winter6afaeb72007-08-03 17:06:41 +0000732 print("--- Writing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000733
Stefan Behnelfaa29482019-06-01 21:49:03 +0200734 with open(FILE, "w") as fp:
735 fprint = partial(print, file=fp)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000736
Stefan Behnelfaa29482019-06-01 21:49:03 +0200737 fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
738 fprint()
739 fprint("#define NAME_MAXLEN", 256)
740 fprint()
741 fprint("/* lexicon */")
742 Array("lexicon", lexicon).dump(fp, trace)
743 Array("lexicon_offset", lexicon_offset).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000744
Stefan Behnelfaa29482019-06-01 21:49:03 +0200745 # split decomposition index table
746 offset1, offset2, shift = splitbins(phrasebook_offset, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000747
Stefan Behnelfaa29482019-06-01 21:49:03 +0200748 fprint("/* code->name phrasebook */")
749 fprint("#define phrasebook_shift", shift)
750 fprint("#define phrasebook_short", short)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000751
Stefan Behnelfaa29482019-06-01 21:49:03 +0200752 Array("phrasebook", phrasebook).dump(fp, trace)
753 Array("phrasebook_offset1", offset1).dump(fp, trace)
754 Array("phrasebook_offset2", offset2).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000755
Stefan Behnelfaa29482019-06-01 21:49:03 +0200756 fprint("/* name->code dictionary */")
757 codehash.dump(fp, trace)
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300758
Stefan Behnelfaa29482019-06-01 21:49:03 +0200759 fprint()
760 fprint('static const unsigned int aliases_start = %#x;' %
761 NAME_ALIASES_START)
762 fprint('static const unsigned int aliases_end = %#x;' %
763 (NAME_ALIASES_START + len(unicode.aliases)))
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300764
Stefan Behnelfaa29482019-06-01 21:49:03 +0200765 fprint('static const unsigned int name_aliases[] = {')
766 for name, codepoint in unicode.aliases:
767 fprint(' 0x%04X,' % codepoint)
768 fprint('};')
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300769
Stefan Behnelfaa29482019-06-01 21:49:03 +0200770 # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
771 # so we are using Py_UCS2 seq[4]. This needs to be updated if longer
772 # sequences or sequences with non-BMP chars are added.
773 # unicodedata_lookup should be adapted too.
774 fprint(dedent("""
775 typedef struct NamedSequence {
776 int seqlen;
777 Py_UCS2 seq[4];
778 } named_sequence;
779 """))
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300780
Stefan Behnelfaa29482019-06-01 21:49:03 +0200781 fprint('static const unsigned int named_sequences_start = %#x;' %
782 NAMED_SEQUENCES_START)
783 fprint('static const unsigned int named_sequences_end = %#x;' %
784 (NAMED_SEQUENCES_START + len(unicode.named_sequences)))
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300785
Stefan Behnelfaa29482019-06-01 21:49:03 +0200786 fprint('static const named_sequence named_sequences[] = {')
787 for name, sequence in unicode.named_sequences:
788 seq_str = ', '.join('0x%04X' % cp for cp in sequence)
789 fprint(' {%d, {%s}},' % (len(sequence), seq_str))
790 fprint('};')
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000791
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000792
def merge_old_version(version, new, old):
    """Compare UCD snapshot *old* against *new* and append a delta record
    for *version* to new.changed.

    The appended record is (version, changes, normalization_changes),
    where changes is a per-codepoint list of (bidir, category, decimal,
    mirrored, east_asian_width, numeric) tuples.  In the byte-sized
    fields 0xFF encodes "no change"; in the numeric field 0 encodes
    "no change" and -1 "had no numeric value".  Several record fields
    (ISO comment, simple case mappings, derived properties, quickchecks)
    are deliberately ignored.  Raises NotImplementedError when the
    composition exclusions differ, and an ad-hoc Difference exception
    for any field change this function does not know how to encode.
    """
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    east_asian_width_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            # k indexes the UnicodeData.txt record fields (see the
            # "Record structure" comment on class UnicodeData)
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 1 and i in PUA_15:
                        # the name is not set in the old.table, but in the
                        # new.table we are using it for aliases and named seq
                        assert value == ''
                    elif k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value is better not 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        # bidi-mirrored flag is stored as a character digit
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 15:
                        # change to east asian width
                        east_asian_width_changes[i] = EASTASIANWIDTH_NAMES.index(value)
                    elif k == 16:
                        # derived property changes; not yet
                        pass
                    elif k == 17:
                        # normalization quickchecks are not performed
                        # for older versions
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, list(zip(bidir_changes, category_changes,
                                          decimal_changes, mirrored_changes,
                                          east_asian_width_changes,
                                          numeric_changes)),
                       normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000888
Stefan Behnelfaa29482019-06-01 21:49:03 +0200889
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000890def open_data(template, version):
891 local = template % ('-'+version,)
892 if not os.path.exists(local):
893 import urllib.request
894 if version == '3.2.0':
895 # irregular url structure
896 url = 'http://www.unicode.org/Public/3.2-Update/' + local
897 else:
898 url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
899 urllib.request.urlretrieve(url, filename=local)
900 if local.endswith('.txt'):
901 return open(local, encoding='utf-8')
902 else:
903 # Unihan.zip
904 return open(local, 'rb')
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000905
Stefan Behnelfaa29482019-06-01 21:49:03 +0200906
Greg Priceef2af1a2019-08-12 22:20:56 -0700907class UcdFile:
908 '''
909 A file in the standard format of the UCD.
910
911 See: https://www.unicode.org/reports/tr44/#Format_Conventions
912
913 Note that, as described there, the Unihan data files have their
914 own separate format.
915 '''
916
917 def __init__(self, template: str, version: str) -> None:
918 self.template = template
919 self.version = version
920
921 def records(self) -> Iterator[List[str]]:
922 with open_data(self.template, self.version) as file:
923 for line in file:
924 line = line.split('#', 1)[0].strip()
925 if not line:
926 continue
927 yield [field.strip() for field in line.split(';')]
928
929 def __iter__(self) -> Iterator[List[str]]:
930 return self.records()
931
932
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000933# --------------------------------------------------------------------
934# the following support code is taken from the unidb utilities
935# Copyright (c) 1999-2000 by Secret Labs AB
936
937# load a unicode-data file from disk
938
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000939class UnicodeData:
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000940 # Record structure:
941 # [ID, name, category, combining, bidi, decomp, (6)
942 # decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
943 # ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
944 # derived-props] (17)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000945
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000946 def __init__(self, version,
947 linebreakprops=False,
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000948 expand=1,
949 cjk_check=True):
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000950 self.changed = []
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000951 table = [None] * 0x110000
Greg Priceef2af1a2019-08-12 22:20:56 -0700952 for s in UcdFile(UNICODE_DATA, version):
953 char = int(s[0], 16)
954 table[char] = s
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000955
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000956 cjk_ranges_found = []
957
Martin v. Löwis97225da2002-11-24 23:05:09 +0000958 # expand first-last ranges
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000959 if expand:
960 field = None
Martin v. Löwis97225da2002-11-24 23:05:09 +0000961 for i in range(0, 0x110000):
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000962 s = table[i]
963 if s:
964 if s[1][-6:] == "First>":
965 s[1] = ""
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000966 field = s
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000967 elif s[1][-5:] == "Last>":
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000968 if s[1].startswith("<CJK Ideograph"):
969 cjk_ranges_found.append((field[0],
970 s[0]))
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000971 s[1] = ""
972 field = None
973 elif field:
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000974 f2 = field[:]
975 f2[0] = "%X" % i
976 table[i] = f2
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000977 if cjk_check and cjk_ranges != cjk_ranges_found:
978 raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000979
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000980 # public attributes
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000981 self.filename = UNICODE_DATA % ''
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000982 self.table = table
Georg Brandlbf82e372008-05-16 17:02:34 +0000983 self.chars = list(range(0x110000)) # unicode 3.2
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000984
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300985 # check for name aliases and named sequences, see #12753
986 # aliases and named sequences are not in 3.2.0
987 if version != '3.2.0':
988 self.aliases = []
989 # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
990 # in order to take advantage of the compression and lookup
991 # algorithms used for the other characters
992 pua_index = NAME_ALIASES_START
Greg Priceef2af1a2019-08-12 22:20:56 -0700993 for char, name, abbrev in UcdFile(NAME_ALIASES, version):
994 char = int(char, 16)
995 self.aliases.append((name, char))
996 # also store the name in the PUA 1
997 self.table[pua_index][1] = name
998 pua_index += 1
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300999 assert pua_index - NAME_ALIASES_START == len(self.aliases)
1000
1001 self.named_sequences = []
Ezio Melotti7c4a7e62013-08-26 01:32:56 +03001002 # store named sequences in the PUA 1, in range U+F0100..,
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001003 # in order to take advantage of the compression and lookup
1004 # algorithms used for the other characters.
1005
Benjamin Peterson71f660e2012-02-20 22:24:29 -05001006 assert pua_index < NAMED_SEQUENCES_START
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001007 pua_index = NAMED_SEQUENCES_START
Greg Priceef2af1a2019-08-12 22:20:56 -07001008 for name, chars in UcdFile(NAMED_SEQUENCES, version):
1009 chars = tuple(int(char, 16) for char in chars.split())
1010 # check that the structure defined in makeunicodename is OK
1011 assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
1012 assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
1013 "the NamedSequence struct and in unicodedata_lookup")
1014 self.named_sequences.append((name, chars))
1015 # also store these in the PUA 1
1016 self.table[pua_index][1] = name
1017 pua_index += 1
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001018 assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)
1019
Martin v. Löwis677bde22002-11-23 22:08:15 +00001020 self.exclusions = {}
Greg Priceef2af1a2019-08-12 22:20:56 -07001021 for char, in UcdFile(COMPOSITION_EXCLUSIONS, version):
1022 char = int(char, 16)
1023 self.exclusions[char] = 1
Martin v. Löwis677bde22002-11-23 22:08:15 +00001024
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00001025 widths = [None] * 0x110000
Greg Priceef2af1a2019-08-12 22:20:56 -07001026 for s in UcdFile(EASTASIAN_WIDTH, version):
1027 if '..' in s[0]:
1028 first, last = [int(c, 16) for c in s[0].split('..')]
1029 chars = list(range(first, last+1))
1030 else:
1031 chars = [int(s[0], 16)]
1032 for char in chars:
1033 widths[char] = s[1]
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001034
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00001035 for i in range(0, 0x110000):
1036 if table[i] is not None:
1037 table[i].append(widths[i])
1038
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001039 for i in range(0, 0x110000):
1040 if table[i] is not None:
1041 table[i].append(set())
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001042
Greg Priceef2af1a2019-08-12 22:20:56 -07001043 for r, p in UcdFile(DERIVED_CORE_PROPERTIES, version):
1044 if ".." in r:
1045 first, last = [int(c, 16) for c in r.split('..')]
1046 chars = list(range(first, last+1))
1047 else:
1048 chars = [int(r, 16)]
1049 for char in chars:
1050 if table[char]:
1051 # Some properties (e.g. Default_Ignorable_Code_Point)
1052 # apply to unassigned code points; ignore them
1053 table[char][-1].add(p)
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001054
Greg Priceef2af1a2019-08-12 22:20:56 -07001055 for s in UcdFile(LINE_BREAK, version):
1056 if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
1057 continue
1058 if '..' not in s[0]:
1059 first = last = int(s[0], 16)
1060 else:
1061 first, last = [int(c, 16) for c in s[0].split('..')]
1062 for char in range(first, last+1):
1063 table[char][-1].add('Line_Break')
Florent Xicluna806d8cf2010-03-30 19:34:18 +00001064
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001065 # We only want the quickcheck properties
1066 # Format: NF?_QC; Y(es)/N(o)/M(aybe)
1067 # Yes is the default, hence only N and M occur
1068 # In 3.2.0, the format was different (NF?_NO)
1069 # The parsing will incorrectly determine these as
1070 # "yes", however, unicodedata.c will not perform quickchecks
1071 # for older versions, and no delta records will be created.
1072 quickchecks = [0] * 0x110000
1073 qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
Greg Priceef2af1a2019-08-12 22:20:56 -07001074 for s in UcdFile(DERIVEDNORMALIZATION_PROPS, version):
1075 if len(s) < 2 or s[1] not in qc_order:
1076 continue
1077 quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
1078 quickcheck_shift = qc_order.index(s[1])*2
1079 quickcheck <<= quickcheck_shift
1080 if '..' not in s[0]:
1081 first = last = int(s[0], 16)
1082 else:
1083 first, last = [int(c, 16) for c in s[0].split('..')]
1084 for char in range(first, last+1):
1085 assert not (quickchecks[char]>>quickcheck_shift)&3
1086 quickchecks[char] |= quickcheck
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001087 for i in range(0, 0x110000):
1088 if table[i] is not None:
1089 table[i].append(quickchecks[i])
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +00001090
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001091 with open_data(UNIHAN, version) as file:
1092 zip = zipfile.ZipFile(file)
1093 if version == '3.2.0':
1094 data = zip.open('Unihan-3.2.0.txt').read()
1095 else:
1096 data = zip.open('Unihan_NumericValues.txt').read()
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001097 for line in data.decode("utf-8").splitlines():
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +00001098 if not line.startswith('U+'):
1099 continue
1100 code, tag, value = line.split(None, 3)[:3]
1101 if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
1102 'kOtherNumeric'):
1103 continue
1104 value = value.strip().replace(',', '')
1105 i = int(code[2:], 16)
1106 # Patch the numeric field
1107 if table[i] is not None:
1108 table[i][8] = value
Greg Priceef2af1a2019-08-12 22:20:56 -07001109
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05001110 sc = self.special_casing = {}
Greg Priceef2af1a2019-08-12 22:20:56 -07001111 for data in UcdFile(SPECIAL_CASING, version):
1112 if data[4]:
1113 # We ignore all conditionals (since they depend on
1114 # languages) except for one, which is hardcoded. See
1115 # handle_capital_sigma in unicodeobject.c.
1116 continue
1117 c = int(data[0], 16)
1118 lower = [int(char, 16) for char in data[1].split()]
1119 title = [int(char, 16) for char in data[2].split()]
1120 upper = [int(char, 16) for char in data[3].split()]
1121 sc[c] = (lower, title, upper)
1122
Benjamin Petersond5890c82012-01-14 13:23:30 -05001123 cf = self.case_folding = {}
1124 if version != '3.2.0':
Greg Priceef2af1a2019-08-12 22:20:56 -07001125 for data in UcdFile(CASE_FOLDING, version):
1126 if data[1] in "CF":
1127 c = int(data[0], 16)
1128 cf[c] = [int(char, 16) for char in data[2].split()]
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +00001129
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001130 def uselatin1(self):
1131 # restrict character range to ISO Latin 1
Georg Brandlbf82e372008-05-16 17:02:34 +00001132 self.chars = list(range(256))
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001133
Stefan Behnelfaa29482019-06-01 21:49:03 +02001134
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001135# hash table tools
1136
1137# this is a straight-forward reimplementation of Python's built-in
1138# dictionary type, using a static data structure, and a custom string
1139# hash algorithm.
1140
def myhash(s, magic):
    """Case-insensitive string hash used by the static name lookup table.

    Multiplies the running hash by *magic* per character and, whenever a
    bit appears in the top byte (bits 24-31), folds that byte back into
    the low 24 bits with an XOR, keeping the result below 2**24.
    """
    h = 0
    for ch in s.upper():
        h = h * magic + ord(ch)
        top = h & 0xff000000
        if top:
            h = (h ^ ((top >> 24) & 0xff)) & 0x00ffffff
    return h
1149
Stefan Behnelfaa29482019-06-01 21:49:03 +02001150
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001151SIZES = [
1152 (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
1153 (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
1154 (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
1155 (2097152,5), (4194304,3), (8388608,33), (16777216,27)
1156]
1157
Stefan Behnelfaa29482019-06-01 21:49:03 +02001158
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001159class Hash:
1160 def __init__(self, name, data, magic):
1161 # turn a (key, value) list into a static hash table structure
1162
1163 # determine table size
1164 for size, poly in SIZES:
1165 if size > len(data):
1166 poly = size + poly
1167 break
1168 else:
Ezio Melotti13925002011-03-16 11:05:33 +02001169 raise AssertionError("ran out of polynomials")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001170
Collin Winter6afaeb72007-08-03 17:06:41 +00001171 print(size, "slots in hash table")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001172
1173 table = [None] * size
1174
1175 mask = size-1
1176
1177 n = 0
1178
1179 hash = myhash
1180
1181 # initialize hash table
1182 for key, value in data:
1183 h = hash(key, magic)
1184 i = (~h) & mask
1185 v = table[i]
1186 if v is None:
1187 table[i] = value
1188 continue
Stefan Behnelfaa29482019-06-01 21:49:03 +02001189 incr = (h ^ (h >> 3)) & mask
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001190 if not incr:
1191 incr = mask
1192 while 1:
1193 n = n + 1
1194 i = (i + incr) & mask
1195 v = table[i]
1196 if v is None:
1197 table[i] = value
1198 break
1199 incr = incr << 1
1200 if incr > mask:
1201 incr = incr ^ poly
1202
Collin Winter6afaeb72007-08-03 17:06:41 +00001203 print(n, "collisions")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001204 self.collisions = n
1205
1206 for i in range(len(table)):
1207 if table[i] is None:
1208 table[i] = 0
1209
1210 self.data = Array(name + "_hash", table)
1211 self.magic = magic
1212 self.name = name
1213 self.size = size
1214 self.poly = poly
1215
Fredrik Lundh7b7dd102001-01-21 22:41:08 +00001216 def dump(self, file, trace):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001217 # write data to file, as a C array
Fredrik Lundh7b7dd102001-01-21 22:41:08 +00001218 self.data.dump(file, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001219 file.write("#define %s_magic %d\n" % (self.name, self.magic))
1220 file.write("#define %s_size %d\n" % (self.name, self.size))
1221 file.write("#define %s_poly %d\n" % (self.name, self.poly))
1222
Stefan Behnelfaa29482019-06-01 21:49:03 +02001223
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001224# stuff to deal with arrays of unsigned integers
1225
class Array:
    """A named sequence of unsigned integers, writable as a C array
    declaration via dump()."""

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        """Write the array to *file* as a static const C array, choosing
        the narrowest element type that holds every value and wrapping
        lines at 78 columns.  With trace set, report the byte size on
        stderr."""
        width = getsize(self.data)
        if trace:
            print(self.name+":", width*len(self.data), "bytes", file=sys.stderr)
        if width == 1:
            ctype = "unsigned char"
        elif width == 2:
            ctype = "unsigned short"
        else:
            ctype = "unsigned int"
        file.write("static const " + ctype + " " + self.name + "[] = {\n")
        if self.data:
            line = " "
            for value in self.data:
                token = str(value) + ", "
                if len(line) + len(token) > 78:
                    file.write(line.rstrip() + "\n")
                    line = " " + token
                else:
                    line += token
            if line.strip():
                file.write(line.rstrip() + "\n")
        file.write("};\n\n")
1257
Stefan Behnelfaa29482019-06-01 21:49:03 +02001258
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001259def getsize(data):
1260 # return smallest possible integer size for the given array
1261 maxdata = max(data)
1262 if maxdata < 256:
1263 return 1
1264 elif maxdata < 65536:
1265 return 2
1266 else:
1267 return 4
1268
Stefan Behnelfaa29482019-06-01 21:49:03 +02001269
Tim Peters21013482000-09-25 07:13:41 +00001270def splitbins(t, trace=0):
1271 """t, trace=0 -> (t1, t2, shift). Split a table to save space.
1272
1273 t is a sequence of ints. This function can be useful to save space if
1274 many of the ints are the same. t1 and t2 are lists of ints, and shift
1275 is an int, chosen to minimize the combined size of t1 and t2 (in C
1276 code), and where for each i in range(len(t)),
1277 t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1278 where mask is a bitmask isolating the last "shift" bits.
1279
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001280 If optional arg trace is non-zero (default zero), progress info
1281 is printed to sys.stderr. The higher the value, the more info
1282 you'll get.
Tim Peters21013482000-09-25 07:13:41 +00001283 """
1284
Tim Peters21013482000-09-25 07:13:41 +00001285 if trace:
1286 def dump(t1, t2, shift, bytes):
Collin Winter6afaeb72007-08-03 17:06:41 +00001287 print("%d+%d bins at shift %d; %d bytes" % (
1288 len(t1), len(t2), shift, bytes), file=sys.stderr)
Stefan Behnelfaa29482019-06-01 21:49:03 +02001289 print("Size of original table:", len(t)*getsize(t), "bytes",
1290 file=sys.stderr)
Tim Peters21013482000-09-25 07:13:41 +00001291 n = len(t)-1 # last valid index
1292 maxshift = 0 # the most we can shift n and still have something left
1293 if n > 0:
1294 while n >> 1:
1295 n >>= 1
1296 maxshift += 1
1297 del n
Christian Heimesa37d4c62007-12-04 23:02:19 +00001298 bytes = sys.maxsize # smallest total size so far
Tim Peters21013482000-09-25 07:13:41 +00001299 t = tuple(t) # so slices can be dict keys
1300 for shift in range(maxshift + 1):
1301 t1 = []
1302 t2 = []
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001303 size = 2**shift
1304 bincache = {}
Tim Peters21013482000-09-25 07:13:41 +00001305 for i in range(0, len(t), size):
1306 bin = t[i:i+size]
1307 index = bincache.get(bin)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001308 if index is None:
Tim Peters21013482000-09-25 07:13:41 +00001309 index = len(t2)
1310 bincache[bin] = index
1311 t2.extend(bin)
1312 t1.append(index >> shift)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001313 # determine memory size
Tim Peters21013482000-09-25 07:13:41 +00001314 b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001315 if trace > 1:
Tim Peters21013482000-09-25 07:13:41 +00001316 dump(t1, t2, shift, b)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001317 if b < bytes:
Tim Peters21013482000-09-25 07:13:41 +00001318 best = t1, t2, shift
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001319 bytes = b
Tim Peters21013482000-09-25 07:13:41 +00001320 t1, t2, shift = best
1321 if trace:
Collin Winter6afaeb72007-08-03 17:06:41 +00001322 print("Best:", end=' ', file=sys.stderr)
Tim Peters21013482000-09-25 07:13:41 +00001323 dump(t1, t2, shift, bytes)
1324 if __debug__:
1325 # exhaustively verify that the decomposition is correct
1326 mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
Guido van Rossum805365e2007-05-07 22:24:25 +00001327 for i in range(len(t)):
Tim Peters21013482000-09-25 07:13:41 +00001328 assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1329 return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001330
Stefan Behnelfaa29482019-06-01 21:49:03 +02001331
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001332if __name__ == "__main__":
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001333 maketables(1)