blob: 38c1f19a65671aa60f20233936163c115bd76fcf [file] [log] [blame]
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001#
Fredrik Lundhe9133f72000-09-25 17:59:57 +00002# (re)generate unicode property and type databases
3#
Benjamin Peterson7c69c1c2018-06-06 20:14:28 -07004# This script converts Unicode database files to Modules/unicodedata_db.h,
5# Modules/unicodename_db.h, and Objects/unicodetype_db.h
Fredrik Lundhcfcea492000-09-25 08:07:06 +00006#
7# history:
8# 2000-09-24 fl created (based on bits and pieces from unidb)
9# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
Fredrik Lundhe9133f72000-09-25 17:59:57 +000010# 2000-09-25 fl added character type table
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000011# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000012# 2000-11-03 fl expand first/last ranges
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000013# 2001-01-19 fl added character name tables (2.1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000014# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold
Martin v. Löwis677bde22002-11-23 22:08:15 +000015# 2002-09-11 wd use string methods
16# 2002-10-18 mvl update to Unicode 3.2
17# 2002-10-22 mvl generate NFC tables
Martin v. Löwis97225da2002-11-24 23:05:09 +000018# 2002-11-24 mvl expand all ranges, sort names version-independently
Martin v. Löwisb5c980b2002-11-25 09:13:37 +000019# 2002-11-25 mvl add UNIDATA_VERSION
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +000020# 2004-05-29 perky add east asian width information
Martin v. Löwis43179c82006-03-11 12:43:44 +000021# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
Georg Brandld52429f2008-07-04 15:55:02 +000022# 2008-06-11 gb add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
Ezio Melotti931b8aa2011-10-21 21:57:36 +030023# 2011-10-21 ezio add support for name aliases and named sequences
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050024# 2012-01 benjamin add full case mappings
Fredrik Lundhcfcea492000-09-25 08:07:06 +000025#
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000026# written by Fredrik Lundh (fredrik@pythonware.com)
Fredrik Lundhf367cac2000-09-24 23:18:31 +000027#
28
Ezio Melotti931b8aa2011-10-21 21:57:36 +030029import os
30import sys
31import zipfile
32
Stefan Behnelfaa29482019-06-01 21:49:03 +020033from functools import partial
Greg Priceef2af1a2019-08-12 22:20:56 -070034from textwrap import dedent
35from typing import *
Fredrik Lundhf367cac2000-09-24 23:18:31 +000036
SCRIPT = sys.argv[0]
VERSION = "3.3"

# The Unicode Database
# --------------------
# When changing UCD version please update
# * Doc/library/stdtypes.rst, and
# * Doc/library/unicodedata.rst
# * Doc/reference/lexical_analysis.rst (two occurrences)
UNIDATA_VERSION = "12.1.0"

# Input-file name templates.  The "%s" slot is filled with "" for the
# current UCD, or "-<version>" when reading an old UCD snapshot.
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt"
CASE_FOLDING = "CaseFolding%s.txt"

# Private Use Areas -- in planes 1, 15, 16
PUA_1 = range(0xE000, 0xF900)
PUA_15 = range(0xF0000, 0xFFFFE)
PUA_16 = range(0x100000, 0x10FFFE)

# we use these ranges of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0200

old_versions = ["3.2.0"]

CATEGORY_NAMES = [
    "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd", "Nl", "No",
    "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm", "Lo",
    "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So",
]

BIDIRECTIONAL_NAMES = [
    "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN",
    "B", "S", "WS", "ON", "LRI", "RLI", "FSI", "PDI",
]

EASTASIANWIDTH_NAMES = ["F", "H", "W", "Na", "A", "N"]

MANDATORY_LINE_BREAKS = ["BK", "CR", "LF", "NL"]

# Character-type flag bits.
# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NUMERIC_MASK = 0x800
CASE_IGNORABLE_MASK = 0x1000
CASED_MASK = 0x2000
EXTENDED_CASE_MASK = 0x4000

# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
    ('3400', '4DB5'),
    ('4E00', '9FEF'),
    ('20000', '2A6D6'),
    ('2A700', '2B734'),
    ('2B740', '2B81D'),
    ('2B820', '2CEA1'),
    ('2CEB0', '2EBE0'),
]
110
Stefan Behnelfaa29482019-06-01 21:49:03 +0200111
def maketables(trace=0):
    """Drive generation of the three generated C headers.

    Reads the current UCD, merges in each old UCD version listed in
    ``old_versions`` (used for the version-delta tables), then writes the
    name, data and type tables.
    """
    print("--- Reading", UNICODE_DATA % "", "...")

    current = UnicodeData(UNIDATA_VERSION)
    print(sum(1 for rec in current.table if rec), "characters")

    for old_version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-" + old_version), "...")
        previous = UnicodeData(old_version, cjk_check=False)
        print(sum(1 for rec in previous.table if rec), "characters")
        merge_old_version(old_version, current, previous)

    makeunicodename(current, trace)
    makeunicodedata(current, trace)
    makeunicodetype(current, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000129
Stefan Behnelfaa29482019-06-01 21:49:03 +0200130
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000131# --------------------------------------------------------------------
132# unicode character properties
133
def makeunicodedata(unicode, trace):
    """Write Modules/unicodedata_db.h from the parsed UCD.

    Emits: the deduplicated per-character database records and their
    two-level index, the decomposition data and its index, the NFC
    first/last reindex tables plus the composition pair table, and the
    delta/normalization tables for each old Unicode version recorded in
    ``unicode.changed``.
    """

    # Record 0 is a sentinel for unassigned code points.
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            # (record fields follow UnicodeData.txt column order:
            #  [2]=category, [3]=combining class, [4]=bidi class,
            #  [9]=mirrored, [15]=east-asian width, [17]=NFC quick check)
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            normalizationquickcheck = record[17]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables (deduplicated via cache)
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                # nfd_nfkd in unicodedata.c has a fixed-size expansion buffer.
                if len(decomp) > 19:
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix: a leading "<compat>"-style tag, if any
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                # prefix index must fit in the low byte of the header word
                assert prefix < 256
                # content: header word (length<<8 | prefix) followed by the
                # decomposed code points
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs: canonical (no prefix) two-character
                # decompositions whose char is not excluded and whose first
                # element has combining class 0
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                # NOTE(review): decomp_data holds flat ints while `decomp` is
                # a list, so this .index() lookup always raises ValueError and
                # every decomposition is appended; the dedup appears to be
                # ineffective (output is still correct) -- confirm upstream.
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    # Renumber NFC first/last characters densely and record the contiguous
    # code-point ranges they span.
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    # Dense (first x last) -> composed-character matrix.
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    with open(FILE, "w") as fp:
        fprint = partial(print, file=fp)

        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
        fprint()
        fprint('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION)
        fprint("/* a list of unique database records */")
        fprint("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {")
        for item in table:
            fprint("    {%d, %d, %d, %d, %d, %d}," % item)
        fprint("};")
        fprint()

        fprint("/* Reindexing of NFC first characters. */")
        fprint("#define TOTAL_FIRST",total_first)
        fprint("#define TOTAL_LAST",total_last)
        fprint("struct reindex{int start;short count,index;};")
        fprint("static struct reindex nfc_first[] = {")
        for start,end in comp_first_ranges:
            fprint("    { %d, %d, %d}," % (start,end-start,comp_first[start]))
        fprint("    {0,0,0}")
        fprint("};\n")
        fprint("static struct reindex nfc_last[] = {")
        for start,end in comp_last_ranges:
            fprint("  { %d, %d, %d}," % (start,end-start,comp_last[start]))
        fprint("  {0,0,0}")
        fprint("};\n")

        # FIXME: <fl> the following tables could be made static, and
        # the support code moved into unicodedatabase.c

        fprint("/* string literals */")
        fprint("const char *_PyUnicode_CategoryNames[] = {")
        for name in CATEGORY_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("const char *_PyUnicode_BidirectionalNames[] = {")
        for name in BIDIRECTIONAL_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("const char *_PyUnicode_EastAsianWidthNames[] = {")
        for name in EASTASIANWIDTH_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("static const char *decomp_prefix[] = {")
        for name in decomp_prefix:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        # split record index table
        index1, index2, shift = splitbins(index, trace)

        fprint("/* index tables for the database records */")
        fprint("#define SHIFT", shift)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # split decomposition index table
        index1, index2, shift = splitbins(decomp_index, trace)

        fprint("/* decomposition data */")
        Array("decomp_data", decomp_data).dump(fp, trace)

        fprint("/* index tables for the decomposition data */")
        fprint("#define DECOMP_SHIFT", shift)
        Array("decomp_index1", index1).dump(fp, trace)
        Array("decomp_index2", index2).dump(fp, trace)

        index, index2, shift = splitbins(comp_data, trace)
        fprint("/* NFC pairs */")
        fprint("#define COMP_SHIFT", shift)
        Array("comp_index", index).dump(fp, trace)
        Array("comp_data", index2).dump(fp, trace)

        # Generate delta tables for old versions
        for version, table, normalization in unicode.changed:
            cversion = version.replace(".","_")
            records = [table[0]]
            cache = {table[0]:0}
            index = [0] * len(table)
            for i, record in enumerate(table):
                try:
                    index[i] = cache[record]
                except KeyError:
                    index[i] = cache[record] = len(records)
                    records.append(record)
            index1, index2, shift = splitbins(index, trace)
            fprint("static const change_record change_records_%s[] = {" % cversion)
            for record in records:
                fprint("    { %s }," % ", ".join(map(str,record)))
            fprint("};")
            Array("changes_%s_index" % cversion, index1).dump(fp, trace)
            Array("changes_%s_data" % cversion, index2).dump(fp, trace)
            fprint("static const change_record* get_change_%s(Py_UCS4 n)" % cversion)
            fprint("{")
            fprint("    int index;")
            fprint("    if (n >= 0x110000) index = 0;")
            fprint("    else {")
            fprint("        index = changes_%s_index[n>>%d];" % (cversion, shift))
            fprint("        index = changes_%s_data[(index<<%d)+(n & %d)];" % \
                   (cversion, shift, ((1<<shift)-1)))
            fprint("    }")
            fprint("    return change_records_%s+index;" % cversion)
            fprint("}\n")
            fprint("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion)
            fprint("{")
            fprint("    switch(n) {")
            for k, v in normalization:
                fprint("    case %s: return 0x%s;" % (hex(k), v))
            fprint("    default: return 0;")
            fprint("    }\n}\n")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000383
384# --------------------------------------------------------------------
385# unicode character type tables
386
def makeunicodetype(unicode, trace):
    """Write Objects/unicodetype_db.h from the parsed UCD.

    Emits: the deduplicated per-character type records (case deltas,
    decimal/digit values and flag bits) with a two-level index, the
    extended-case array for multi-character mappings, and the generated
    C functions _PyUnicode_ToNumeric, _PyUnicode_IsWhitespace and
    _PyUnicode_IsLinebreak.
    """

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types
    # Record 0 is a sentinel for unassigned code points.
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}
    spaces = []
    linebreaks = []
    extra_casing = []

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            # ([2]=category, [4]=bidi class, [16]=derived properties set;
            #  [6]/[7]/[8]=decimal/digit/numeric, [12]/[13]/[14]=case maps)
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if "Lowercase" in properties:
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if "Uppercase" in properties:
                flags |= UPPER_MASK
            # printable: everything except other (C*) and separator (Z*)
            # categories, plus the plain space character itself
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            if "Cased" in properties:
                flags |= CASED_MASK
            if "Case_Ignorable" in properties:
                flags |= CASE_IGNORABLE_MASK
            sc = unicode.special_casing.get(char)
            cf = unicode.case_folding.get(char, [char])
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                title = upper
            # A case folding different from the simple lowercase mapping
            # forces the extended-case path below.
            if sc is None and cf != [lower]:
                sc = ([lower], [title], [upper])
            if sc is None:
                if upper == lower == title:
                    upper = lower = title = 0
                else:
                    # store case mappings as deltas from the character
                    upper = upper - char
                    lower = lower - char
                    title = title - char
                    # deltas must fit in a signed 32-bit int
                    assert (abs(upper) <= 2147483647 and
                            abs(lower) <= 2147483647 and
                            abs(title) <= 2147483647)
            else:
                # This happens either when some character maps to more than one
                # character in uppercase, lowercase, or titlecase or the
                # casefolded version of the character is different from the
                # lowercase. The extra characters are stored in a different
                # array.
                # Encoding: bits 0-23 = offset into extra_casing,
                # bits 24+ = mapping length; for `lower`, bits 20-23
                # additionally hold the case-folding length.
                flags |= EXTENDED_CASE_MASK
                lower = len(extra_casing) | (len(sc[0]) << 24)
                extra_casing.extend(sc[0])
                if cf != sc[0]:
                    lower |= len(cf) << 20
                    extra_casing.extend(cf)
                upper = len(extra_casing) | (len(sc[2]) << 24)
                extra_casing.extend(sc[2])
                # Title is probably equal to upper.
                if sc[1] == sc[2]:
                    title = upper
                else:
                    title = len(extra_casing) | (len(sc[1]) << 24)
                    extra_casing.extend(sc[1])
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                # group code points by their textual numeric value
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables (deduplicated via cache)
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")
    print(len(extra_casing), "extended case array")

    print("--- Writing", FILE, "...")

    with open(FILE, "w") as fp:
        fprint = partial(print, file=fp)

        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
        fprint()
        fprint("/* a list of unique character type descriptors */")
        fprint("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {")
        for item in table:
            fprint("    {%d, %d, %d, %d, %d, %d}," % item)
        fprint("};")
        fprint()

        fprint("/* extended case mappings */")
        fprint()
        fprint("const Py_UCS4 _PyUnicode_ExtendedCase[] = {")
        for c in extra_casing:
            fprint("    %d," % c)
        fprint("};")
        fprint()

        # split decomposition index table
        index1, index2, shift = splitbins(index, trace)

        fprint("/* type indexes */")
        fprint("#define SHIFT", shift)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # Generate code for _PyUnicode_ToNumeric()
        numeric_items = sorted(numeric.items())
        fprint('/* Returns the numeric value as double for Unicode characters')
        fprint(' * having this property, -1.0 otherwise.')
        fprint(' */')
        fprint('double _PyUnicode_ToNumeric(Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')
        for value, codepoints in numeric_items:
            # Turn text into float literals
            parts = value.split('/')
            parts = [repr(float(part)) for part in parts]
            value = '/'.join(parts)

            codepoints.sort()
            for codepoint in codepoints:
                fprint('    case 0x%04X:' % (codepoint,))
            fprint('        return (double) %s;' % (value,))
        fprint('    }')
        fprint('    return -1.0;')
        fprint('}')
        fprint()

        # Generate code for _PyUnicode_IsWhitespace()
        fprint("/* Returns 1 for Unicode characters having the bidirectional")
        fprint(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.")
        fprint(" */")
        fprint('int _PyUnicode_IsWhitespace(const Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')

        for codepoint in sorted(spaces):
            fprint('    case 0x%04X:' % (codepoint,))
        fprint('        return 1;')

        fprint('    }')
        fprint('    return 0;')
        fprint('}')
        fprint()

        # Generate code for _PyUnicode_IsLinebreak()
        fprint("/* Returns 1 for Unicode characters having the line break")
        fprint(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional")
        fprint(" * type 'B', 0 otherwise.")
        fprint(" */")
        fprint('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')
        for codepoint in sorted(linebreaks):
            fprint('    case 0x%04X:' % (codepoint,))
        fprint('        return 1;')

        fprint('    }')
        fprint('    return 0;')
        fprint('}')
        fprint()
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000596
597# --------------------------------------------------------------------
598# unicode name database
599
600def makeunicodename(unicode, trace):
601
602 FILE = "Modules/unicodename_db.h"
603
Collin Winter6afaeb72007-08-03 17:06:41 +0000604 print("--- Preparing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000605
606 # collect names
607 names = [None] * len(unicode.chars)
608
609 for char in unicode.chars:
610 record = unicode.table[char]
611 if record:
612 name = record[1].strip()
613 if name and name[0] != "<":
614 names[char] = name + chr(0)
615
Jon Dufresne39726282017-05-18 07:35:54 -0700616 print(len([n for n in names if n is not None]), "distinct names")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000617
618 # collect unique words from names (note that we differ between
619 # words inside a sentence, and words ending a sentence. the
620 # latter includes the trailing null byte.
621
622 words = {}
623 n = b = 0
624 for char in unicode.chars:
625 name = names[char]
626 if name:
627 w = name.split()
628 b = b + len(name)
629 n = n + len(w)
630 for w in w:
631 l = words.get(w)
632 if l:
633 l.append(None)
634 else:
635 words[w] = [len(words)]
636
Collin Winter6afaeb72007-08-03 17:06:41 +0000637 print(n, "words in text;", b, "bytes")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000638
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000639 wordlist = list(words.items())
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000640
Martin v. Löwis97225da2002-11-24 23:05:09 +0000641 # sort on falling frequency, then by name
Mark Dickinsona56c4672009-01-27 18:17:45 +0000642 def word_key(a):
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000643 aword, alist = a
Mark Dickinsona56c4672009-01-27 18:17:45 +0000644 return -len(alist), aword
645 wordlist.sort(key=word_key)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000646
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000647 # figure out how many phrasebook escapes we need
648 escapes = 0
649 while escapes * 256 < len(wordlist):
650 escapes = escapes + 1
Collin Winter6afaeb72007-08-03 17:06:41 +0000651 print(escapes, "escapes")
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000652
653 short = 256 - escapes
654
655 assert short > 0
656
Collin Winter6afaeb72007-08-03 17:06:41 +0000657 print(short, "short indexes in lexicon")
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000658
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000659 # statistics
660 n = 0
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000661 for i in range(short):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000662 n = n + len(wordlist[i][1])
Collin Winter6afaeb72007-08-03 17:06:41 +0000663 print(n, "short indexes in phrasebook")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000664
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000665 # pick the most commonly used words, and sort the rest on falling
666 # length (to maximize overlap)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000667
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000668 wordlist, wordtail = wordlist[:short], wordlist[short:]
Raymond Hettingerd4cb56d2008-01-30 02:55:10 +0000669 wordtail.sort(key=lambda a: a[0], reverse=True)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000670 wordlist.extend(wordtail)
671
672 # generate lexicon from words
673
674 lexicon_offset = [0]
675 lexicon = ""
676 words = {}
677
678 # build a lexicon string
679 offset = 0
680 for w, x in wordlist:
681 # encoding: bit 7 indicates last character in word (chr(128)
682 # indicates the last character in an entire string)
683 ww = w[:-1] + chr(ord(w[-1])+128)
684 # reuse string tails, when possible
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000685 o = lexicon.find(ww)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000686 if o < 0:
687 o = offset
688 lexicon = lexicon + ww
689 offset = offset + len(w)
690 words[w] = len(lexicon_offset)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000691 lexicon_offset.append(o)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000692
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000693 lexicon = list(map(ord, lexicon))
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000694
695 # generate phrasebook from names and lexicon
696 phrasebook = [0]
697 phrasebook_offset = [0] * len(unicode.chars)
698 for char in unicode.chars:
699 name = names[char]
700 if name:
701 w = name.split()
702 phrasebook_offset[char] = len(phrasebook)
703 for w in w:
704 i = words[w]
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000705 if i < short:
706 phrasebook.append(i)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000707 else:
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000708 # store as two bytes
709 phrasebook.append((i>>8) + short)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000710 phrasebook.append(i&255)
711
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000712 assert getsize(phrasebook) == 1
713
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000714 #
715 # unicode name hash table
716
717 # extract names
718 data = []
719 for char in unicode.chars:
720 record = unicode.table[char]
721 if record:
722 name = record[1].strip()
723 if name and name[0] != "<":
724 data.append((name, char))
725
726 # the magic number 47 was chosen to minimize the number of
727 # collisions on the current data set. if you like, change it
728 # and see what happens...
729
730 codehash = Hash("code", data, 47)
731
Collin Winter6afaeb72007-08-03 17:06:41 +0000732 print("--- Writing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000733
Stefan Behnelfaa29482019-06-01 21:49:03 +0200734 with open(FILE, "w") as fp:
735 fprint = partial(print, file=fp)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000736
Stefan Behnelfaa29482019-06-01 21:49:03 +0200737 fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
738 fprint()
739 fprint("#define NAME_MAXLEN", 256)
740 fprint()
741 fprint("/* lexicon */")
742 Array("lexicon", lexicon).dump(fp, trace)
743 Array("lexicon_offset", lexicon_offset).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000744
Stefan Behnelfaa29482019-06-01 21:49:03 +0200745 # split decomposition index table
746 offset1, offset2, shift = splitbins(phrasebook_offset, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000747
Stefan Behnelfaa29482019-06-01 21:49:03 +0200748 fprint("/* code->name phrasebook */")
749 fprint("#define phrasebook_shift", shift)
750 fprint("#define phrasebook_short", short)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000751
Stefan Behnelfaa29482019-06-01 21:49:03 +0200752 Array("phrasebook", phrasebook).dump(fp, trace)
753 Array("phrasebook_offset1", offset1).dump(fp, trace)
754 Array("phrasebook_offset2", offset2).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000755
Stefan Behnelfaa29482019-06-01 21:49:03 +0200756 fprint("/* name->code dictionary */")
757 codehash.dump(fp, trace)
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300758
Stefan Behnelfaa29482019-06-01 21:49:03 +0200759 fprint()
760 fprint('static const unsigned int aliases_start = %#x;' %
761 NAME_ALIASES_START)
762 fprint('static const unsigned int aliases_end = %#x;' %
763 (NAME_ALIASES_START + len(unicode.aliases)))
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300764
Stefan Behnelfaa29482019-06-01 21:49:03 +0200765 fprint('static const unsigned int name_aliases[] = {')
766 for name, codepoint in unicode.aliases:
767 fprint(' 0x%04X,' % codepoint)
768 fprint('};')
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300769
Stefan Behnelfaa29482019-06-01 21:49:03 +0200770 # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
771 # so we are using Py_UCS2 seq[4]. This needs to be updated if longer
772 # sequences or sequences with non-BMP chars are added.
773 # unicodedata_lookup should be adapted too.
774 fprint(dedent("""
775 typedef struct NamedSequence {
776 int seqlen;
777 Py_UCS2 seq[4];
778 } named_sequence;
779 """))
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300780
Stefan Behnelfaa29482019-06-01 21:49:03 +0200781 fprint('static const unsigned int named_sequences_start = %#x;' %
782 NAMED_SEQUENCES_START)
783 fprint('static const unsigned int named_sequences_end = %#x;' %
784 (NAMED_SEQUENCES_START + len(unicode.named_sequences)))
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300785
Stefan Behnelfaa29482019-06-01 21:49:03 +0200786 fprint('static const named_sequence named_sequences[] = {')
787 for name, sequence in unicode.named_sequences:
788 seq_str = ', '.join('0x%04X' % cp for cp in sequence)
789 fprint(' {%d, {%s}},' % (len(sequence), seq_str))
790 fprint('};')
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000791
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000792
793def merge_old_version(version, new, old):
794 # Changes to exclusion file not implemented yet
795 if old.exclusions != new.exclusions:
Collin Wintera817e582007-08-22 23:05:06 +0000796 raise NotImplementedError("exclusions differ")
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000797
798 # In these change records, 0xFF means "no change"
799 bidir_changes = [0xFF]*0x110000
800 category_changes = [0xFF]*0x110000
801 decimal_changes = [0xFF]*0x110000
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000802 mirrored_changes = [0xFF]*0x110000
Benjamin Peterson67752312016-09-14 23:53:47 -0700803 east_asian_width_changes = [0xFF]*0x110000
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000804 # In numeric data, 0 means "no change",
805 # -1 means "did not have a numeric value
806 numeric_changes = [0] * 0x110000
807 # normalization_changes is a list of key-value pairs
808 normalization_changes = []
809 for i in range(0x110000):
810 if new.table[i] is None:
811 # Characters unassigned in the new version ought to
812 # be unassigned in the old one
813 assert old.table[i] is None
814 continue
815 # check characters unassigned in the old version
816 if old.table[i] is None:
817 # category 0 is "unassigned"
818 category_changes[i] = 0
819 continue
820 # check characters that differ
821 if old.table[i] != new.table[i]:
822 for k in range(len(old.table[i])):
823 if old.table[i][k] != new.table[i][k]:
824 value = old.table[i][k]
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300825 if k == 1 and i in PUA_15:
826 # the name is not set in the old.table, but in the
827 # new.table we are using it for aliases and named seq
828 assert value == ''
829 elif k == 2:
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000830 #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
831 category_changes[i] = CATEGORY_NAMES.index(value)
832 elif k == 4:
833 #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
834 bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
835 elif k == 5:
836 #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
837 # We assume that all normalization changes are in 1:1 mappings
838 assert " " not in value
839 normalization_changes.append((i, value))
840 elif k == 6:
841 #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
842 # we only support changes where the old value is a single digit
843 assert value in "0123456789"
844 decimal_changes[i] = int(value)
845 elif k == 8:
846 # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
847 # Since 0 encodes "no change", the old value is better not 0
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000848 if not value:
849 numeric_changes[i] = -1
850 else:
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +0000851 numeric_changes[i] = float(value)
852 assert numeric_changes[i] not in (0, -1)
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000853 elif k == 9:
854 if value == 'Y':
855 mirrored_changes[i] = '1'
856 else:
857 mirrored_changes[i] = '0'
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000858 elif k == 11:
859 # change to ISO comment, ignore
860 pass
861 elif k == 12:
862 # change to simple uppercase mapping; ignore
863 pass
864 elif k == 13:
865 # change to simple lowercase mapping; ignore
866 pass
867 elif k == 14:
868 # change to simple titlecase mapping; ignore
869 pass
Benjamin Peterson67752312016-09-14 23:53:47 -0700870 elif k == 15:
871 # change to east asian width
872 east_asian_width_changes[i] = EASTASIANWIDTH_NAMES.index(value)
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000873 elif k == 16:
874 # derived property changes; not yet
875 pass
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000876 elif k == 17:
877 # normalization quickchecks are not performed
878 # for older versions
879 pass
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000880 else:
881 class Difference(Exception):pass
Collin Wintera817e582007-08-22 23:05:06 +0000882 raise Difference(hex(i), k, old.table[i], new.table[i])
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000883 new.changed.append((version, list(zip(bidir_changes, category_changes,
Benjamin Peterson67752312016-09-14 23:53:47 -0700884 decimal_changes, mirrored_changes,
885 east_asian_width_changes,
886 numeric_changes)),
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000887 normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000888
Stefan Behnelfaa29482019-06-01 21:49:03 +0200889
def open_data(template, version):
    """Return an open file object for the UCD file *template* at *version*.

    The file is downloaded from unicode.org on first use and cached in the
    current directory as ``template % ('-' + version)``.  ``.txt`` files are
    opened in UTF-8 text mode; anything else (the Unihan zip archive) is
    opened in binary mode.
    """
    local = template % ('-' + version,)
    if not os.path.exists(local):
        import urllib.request
        if version == '3.2.0':
            # irregular url structure
            url = 'http://www.unicode.org/Public/3.2-Update/' + local
        else:
            url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
        urllib.request.urlretrieve(url, filename=local)
    if local.endswith('.txt'):
        return open(local, encoding='utf-8')
    # Unihan.zip
    return open(local, 'rb')
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000905
Stefan Behnelfaa29482019-06-01 21:49:03 +0200906
class UcdFile:
    '''
    A file in the standard format of the UCD.

    See: https://www.unicode.org/reports/tr44/#Format_Conventions

    Note that, as described there, the Unihan data files have their
    own separate format.
    '''

    def __init__(self, template: str, version: str) -> None:
        self.template = template
        self.version = version

    def records(self) -> Iterator[List[str]]:
        # Yield one list of ';'-separated fields per data line,
        # skipping comments ('#' to end of line) and blank lines.
        with open_data(self.template, self.version) as file:
            for raw in file:
                payload = raw.split('#', 1)[0].strip()
                if payload:
                    yield [field.strip() for field in payload.split(';')]

    def __iter__(self) -> Iterator[List[str]]:
        return self.records()
931
932
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000933# --------------------------------------------------------------------
934# the following support code is taken from the unidb utilities
935# Copyright (c) 1999-2000 by Secret Labs AB
936
937# load a unicode-data file from disk
938
class UnicodeData:
    """In-memory representation of the Unicode Character Database.

    ``self.table`` maps each code point to its (extended) UnicodeData.txt
    record; ``self.chars`` is the list of code points to process.
    """
    # Record structure:
    # [ID, name, category, combining, bidi, decomp, (6)
    # decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
    # ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
    # derived-props] (17)

    def __init__(self, version, cjk_check=True):
        # list of (version, change-arrays, normalization-changes) delta
        # records, filled in later by merge_old_version()
        self.changed = []
        table = [None] * 0x110000
        for s in UcdFile(UNICODE_DATA, version):
            char = int(s[0], 16)
            table[char] = s

        cjk_ranges_found = []

        # expand first-last ranges
        field = None
        for i in range(0, 0x110000):
            s = table[i]
            if s:
                if s[1][-6:] == "First>":
                    s[1] = ""
                    field = s
                elif s[1][-5:] == "Last>":
                    if s[1].startswith("<CJK Ideograph"):
                        cjk_ranges_found.append((field[0],
                                                 s[0]))
                    s[1] = ""
                    field = None
            elif field:
                # code point inside a First..Last range: clone the range's
                # record with this code point's own ID
                f2 = field[:]
                f2[0] = "%X" % i
                table[i] = f2
        if cjk_check and cjk_ranges != cjk_ranges_found:
            raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)

        # public attributes
        self.filename = UNICODE_DATA % ''
        self.table = table
        self.chars = list(range(0x110000)) # unicode 3.2

        # check for name aliases and named sequences, see #12753
        # aliases and named sequences are not in 3.2.0
        if version != '3.2.0':
            self.aliases = []
            # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters
            pua_index = NAME_ALIASES_START
            for char, name, abbrev in UcdFile(NAME_ALIASES, version):
                char = int(char, 16)
                self.aliases.append((name, char))
                # also store the name in the PUA 1
                self.table[pua_index][1] = name
                pua_index += 1
            assert pua_index - NAME_ALIASES_START == len(self.aliases)

            self.named_sequences = []
            # store named sequences in the PUA 1, in range U+F0100..,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters.

            assert pua_index < NAMED_SEQUENCES_START
            pua_index = NAMED_SEQUENCES_START
            for name, chars in UcdFile(NAMED_SEQUENCES, version):
                chars = tuple(int(char, 16) for char in chars.split())
                # check that the structure defined in makeunicodename is OK
                assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
                assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
                    "the NamedSequence struct and in unicodedata_lookup")
                self.named_sequences.append((name, chars))
                # also store these in the PUA 1
                self.table[pua_index][1] = name
                pua_index += 1
            assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)

        # composition exclusions: set of code points excluded from
        # canonical composition (value is always 1)
        self.exclusions = {}
        for char, in UcdFile(COMPOSITION_EXCLUSIONS, version):
            char = int(char, 16)
            self.exclusions[char] = 1

        # east-asian width: ranges in the data file are given as X..Y
        widths = [None] * 0x110000
        for s in UcdFile(EASTASIAN_WIDTH, version):
            if '..' in s[0]:
                first, last = [int(c, 16) for c in s[0].split('..')]
                chars = list(range(first, last+1))
            else:
                chars = [int(s[0], 16)]
            for char in chars:
                widths[char] = s[1]

        # record field 15: east-asian width
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

        # record field 16: set of derived property names
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(set())

        for r, p in UcdFile(DERIVED_CORE_PROPERTIES, version):
            if ".." in r:
                first, last = [int(c, 16) for c in r.split('..')]
                chars = list(range(first, last+1))
            else:
                chars = [int(r, 16)]
            for char in chars:
                if table[char]:
                    # Some properties (e.g. Default_Ignorable_Code_Point)
                    # apply to unassigned code points; ignore them
                    table[char][-1].add(p)

        # mandatory line breaks are folded into the derived-props set
        for s in UcdFile(LINE_BREAK, version):
            if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
                continue
            if '..' not in s[0]:
                first = last = int(s[0], 16)
            else:
                first, last = [int(c, 16) for c in s[0].split('..')]
            for char in range(first, last+1):
                table[char][-1].add('Line_Break')

        # We only want the quickcheck properties
        # Format: NF?_QC; Y(es)/N(o)/M(aybe)
        # Yes is the default, hence only N and M occur
        # In 3.2.0, the format was different (NF?_NO)
        # The parsing will incorrectly determine these as
        # "yes", however, unicodedata.c will not perform quickchecks
        # for older versions, and no delta records will be created.
        quickchecks = [0] * 0x110000
        qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
        for s in UcdFile(DERIVEDNORMALIZATION_PROPS, version):
            if len(s) < 2 or s[1] not in qc_order:
                continue
            quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
            quickcheck_shift = qc_order.index(s[1])*2
            quickcheck <<= quickcheck_shift
            if '..' not in s[0]:
                first = last = int(s[0], 16)
            else:
                first, last = [int(c, 16) for c in s[0].split('..')]
            for char in range(first, last+1):
                # each property must be set at most once per character
                assert not (quickchecks[char]>>quickcheck_shift)&3
                quickchecks[char] |= quickcheck
        # record field 17: packed quickcheck bits (2 per property)
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(quickchecks[i])

        # Unihan numeric values (CJK ideographs); patches record field 8
        with open_data(UNIHAN, version) as file:
            zip = zipfile.ZipFile(file)
            if version == '3.2.0':
                data = zip.open('Unihan-3.2.0.txt').read()
            else:
                data = zip.open('Unihan_NumericValues.txt').read()
        for line in data.decode("utf-8").splitlines():
            if not line.startswith('U+'):
                continue
            code, tag, value = line.split(None, 3)[:3]
            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                           'kOtherNumeric'):
                continue
            value = value.strip().replace(',', '')
            i = int(code[2:], 16)
            # Patch the numeric field
            if table[i] is not None:
                table[i][8] = value

        # full case mappings: codepoint -> ([lower], [title], [upper])
        sc = self.special_casing = {}
        for data in UcdFile(SPECIAL_CASING, version):
            if data[4]:
                # We ignore all conditionals (since they depend on
                # languages) except for one, which is hardcoded. See
                # handle_capital_sigma in unicodeobject.c.
                continue
            c = int(data[0], 16)
            lower = [int(char, 16) for char in data[1].split()]
            title = [int(char, 16) for char in data[2].split()]
            upper = [int(char, 16) for char in data[3].split()]
            sc[c] = (lower, title, upper)

        # case folding (C and F statuses only); not present in 3.2.0
        cf = self.case_folding = {}
        if version != '3.2.0':
            for data in UcdFile(CASE_FOLDING, version):
                if data[1] in "CF":
                    c = int(data[0], 16)
                    cf[c] = [int(char, 16) for char in data[2].split()]

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = list(range(256))
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001129
Stefan Behnelfaa29482019-06-01 21:49:03 +02001130
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001131# hash table tools
1132
1133# this is a straight-forward reimplementation of Python's built-in
1134# dictionary type, using a static data structure, and a custom string
1135# hash algorithm.
1136
def myhash(s, magic):
    """Case-insensitive string hash, folded to 24 bits after each step."""
    h = 0
    for ch in s.upper():
        h = h * magic + ord(ch)
        # fold anything that spilled above bit 23 back into the low bits
        top = h & 0xff000000
        if top:
            h = (h ^ ((top >> 24) & 0xff)) & 0x00ffffff
    return h
1145
Stefan Behnelfaa29482019-06-01 21:49:03 +02001146
# candidate (table size, polynomial) pairs for the static hash table;
# Hash.__init__ picks the first size strictly larger than the data set
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
1153
Stefan Behnelfaa29482019-06-01 21:49:03 +02001154
class Hash:
    """Static open-addressing hash table, dumpable as C arrays/defines."""

    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size: first power of two larger than the data
        # set, paired with its probing polynomial from SIZES
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynomials")

        print(size, "slots in hash table")

        table = [None] * size

        mask = size-1

        # collision counter
        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            # slot taken: probe with a key-derived increment, doubling it
            # (mod the polynomial) after each miss
            incr = (h ^ (h >> 3)) & mask
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print(n, "collisions")
        self.collisions = n

        # empty slots are encoded as 0 in the generated C array
        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array plus the #defines the C-side
        # lookup needs to re-run the same probe sequence
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
1218
Stefan Behnelfaa29482019-06-01 21:49:03 +02001219
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001220# stuff to deal with arrays of unsigned integers
1221
class Array:
    """A named sequence of unsigned integers, dumpable as a C array."""

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array of the smallest unsigned
        # integer type that can hold every element
        width = getsize(self.data)
        if trace:
            print(self.name+":", width*len(self.data), "bytes", file=sys.stderr)
        ctype = {1: "unsigned char", 2: "unsigned short"}.get(width, "unsigned int")
        file.write("static const " + ctype)
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            # accumulate items into lines of at most 78 columns
            line = " "
            for value in self.data:
                chunk = str(value) + ", "
                if len(line) + len(chunk) > 78:
                    file.write(line.rstrip() + "\n")
                    line = " " + chunk
                else:
                    line = line + chunk
            if line.strip():
                file.write(line.rstrip() + "\n")
        file.write("};\n\n")
1253
Stefan Behnelfaa29482019-06-01 21:49:03 +02001254
def getsize(data):
    """Return the smallest C integer size (1, 2 or 4 bytes) that can
    hold every value in *data*."""
    biggest = max(data)
    if biggest >= 65536:
        return 4
    return 2 if biggest >= 256 else 1
1264
Stefan Behnelfaa29482019-06-01 21:49:03 +02001265
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """
    if trace:
        def report(a, b, shift, nbytes):
            print("%d+%d bins at shift %d; %d bytes" % (
                len(a), len(b), shift, nbytes), file=sys.stderr)
        print("Size of original table:", len(t)*getsize(t), "bytes",
              file=sys.stderr)

    # the largest shift that still leaves something in the index table
    top = len(t) - 1       # last valid index
    maxshift = 0
    if top > 0:
        while top >> 1:
            top >>= 1
            maxshift += 1

    best_bytes = sys.maxsize     # smallest combined size seen so far
    t = tuple(t)                 # so slices can be dict keys
    for shift in range(maxshift + 1):
        binsize = 1 << shift
        t1 = []
        t2 = []
        seen = {}                # bin contents -> offset in t2
        for start in range(0, len(t), binsize):
            chunk = t[start:start + binsize]
            pos = seen.get(chunk)
            if pos is None:
                pos = len(t2)
                seen[chunk] = pos
                t2.extend(chunk)
            t1.append(pos >> shift)
        # combined memory footprint of both tables
        total = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            report(t1, t2, shift, total)
        if total < best_bytes:
            best = t1, t2, shift
            best_bytes = total

    t1, t2, shift = best
    if trace:
        print("Best:", end=' ', file=sys.stderr)
        report(t1, t2, shift, best_bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = (1 << shift) - 1  # low-bit mask of "shift" bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001326
Stefan Behnelfaa29482019-06-01 21:49:03 +02001327
if __name__ == "__main__":
    # regenerate the database headers with tracing enabled
    maketables(1)