#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
# 2008-06-11 gb   add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#

import sys

SCRIPT = sys.argv[0]
VERSION = "2.6"

# The Unicode Database
UNIDATA_VERSION = "5.2.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.txt"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"

old_versions = ["3.2.0"]

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NODELTA_MASK = 0x800
NUMERIC_MASK = 0x1000
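# Added note (not from the original source): these masks are OR'ed together
# into the flags field of each type record below, so one small integer can
# carry several boolean properties.  For example, an uppercase ASCII letter
# would typically end up with something like
#   ALPHA_MASK | UPPER_MASK | PRINTABLE_MASK | XID_START_MASK | XID_CONTINUE_MASK
# and a property test is a bitwise AND against the relevant mask.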

def maketables(trace=0):

    print("--- Reading", UNICODE_DATA % "", "...")

    version = ""
    unicode = UnicodeData(UNICODE_DATA % version,
                          COMPOSITION_EXCLUSIONS % version,
                          EASTASIAN_WIDTH % version,
                          UNIHAN % version,
                          DERIVED_CORE_PROPERTIES % version,
                          DERIVEDNORMALIZATION_PROPS % version,
                          LINE_BREAK % version)

    print(len(list(filter(None, unicode.table))), "characters")

    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-"+version), "...")
        old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
                                  COMPOSITION_EXCLUSIONS % ("-"+version),
                                  EASTASIAN_WIDTH % ("-"+version),
                                  UNIHAN % ("-"+version),
                                  DERIVED_CORE_PROPERTIES % ("-"+version))
        print(len(list(filter(None, old_unicode.table))), "characters")
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)

# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):

    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            normalizationquickcheck = record[17]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                if len(decomp) > 19:
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
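                # Added illustration (not in the original): the first list
                # element packs the prefix index into the low byte and the
                # decomposition length into the high byte, e.g. a two-code-point
                # canonical decomposition (prefix index 0) starts with
                # (2<<8) | 0 == 512.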
                # Collect NFC pairs
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

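    # Added note: comp_data is a flattened total_first x total_last matrix;
    # roughly speaking, the composed character for a (first, last) pair is
    # found at comp_data[comp_first[first]*total_last + comp_last[last]],
    # with 0 meaning "no canonical composition".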
    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
    print("/* a list of unique database records */", file=fp)
    print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    print("/* Reindexing of NFC first characters. */", file=fp)
    print("#define TOTAL_FIRST",total_first, file=fp)
    print("#define TOTAL_LAST",total_last, file=fp)
    print("struct reindex{int start;short count,index;};", file=fp)
    print("static struct reindex nfc_first[] = {", file=fp)
    for start,end in comp_first_ranges:
        print("  { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)
    print("  {0,0,0}", file=fp)
    print("};\n", file=fp)
    print("static struct reindex nfc_last[] = {", file=fp)
    for start,end in comp_last_ranges:
        print("  { %d, %d, %d}," % (start,end-start,comp_last[start]), file=fp)
    print("  {0,0,0}", file=fp)
    print("};\n", file=fp)

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print("/* string literals */", file=fp)
    print("const char *_PyUnicode_CategoryNames[] = {", file=fp)
    for name in CATEGORY_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_BidirectionalNames[] = {", file=fp)
    for name in BIDIRECTIONAL_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_EastAsianWidthNames[] = {", file=fp)
    for name in EASTASIANWIDTH_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("static const char *decomp_prefix[] = {", file=fp)
    for name in decomp_prefix:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print("/* index tables for the database records */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print("/* decomposition data */", file=fp)
    Array("decomp_data", decomp_data).dump(fp, trace)

    print("/* index tables for the decomposition data */", file=fp)
    print("#define DECOMP_SHIFT", shift, file=fp)
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print("/* NFC pairs */", file=fp)
    print("#define COMP_SHIFT", shift, file=fp)
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print("static const change_record change_records_%s[] = {" % cversion, file=fp)
        for record in records:
            print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
        print("};", file=fp)
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tint index;", file=fp)
        print("\tif (n >= 0x110000) index = 0;", file=fp)
        print("\telse {", file=fp)
        print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
        print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1)), file=fp)
        print("\t}", file=fp)
        print("\treturn change_records_%s+index;" % cversion, file=fp)
        print("}\n", file=fp)
        print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tswitch(n) {", file=fp)
        for k, v in normalization:
            print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
        print("\tdefault: return 0;", file=fp)
        print("\t}\n}\n", file=fp)

    fp.close()

# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}
    spaces = []
    linebreaks = []

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            delta = True
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            # use delta predictor for upper/lower/title if it fits
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                # UCD.html says that a missing title char means that
                # it defaults to the uppercase character, not to the
                # character itself. Apparently, in the current UCD (5.x)
                # this feature is never used
                title = upper
            upper_d = upper - char
            lower_d = lower - char
            title_d = title - char
            if -32768 <= upper_d <= 32767 and \
               -32768 <= lower_d <= 32767 and \
               -32768 <= title_d <= 32767:
                # use deltas
                upper = upper_d & 0xffff
                lower = lower_d & 0xffff
                title = title_d & 0xffff
            else:
                flags |= NODELTA_MASK
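            # Added note: when all three deltas fit, upper/lower/title are
            # stored as 16-bit two's-complement offsets from the character
            # itself (e.g. 'a' -> 'A' is -32, stored as 0xffe0); NODELTA_MASK
            # tells the consumer to treat the stored values as absolute
            # code points instead.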
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("/* a list of unique character type descriptors */", file=fp)
    print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    # split decomposition index table
    index1, index2, shift = splitbins(index, trace)

    print("/* type indexes */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # Generate code for _PyUnicode_ToNumeric()
    numeric_items = sorted(numeric.items())
    print('/* Returns the numeric value as double for Unicode characters', file=fp)
    print(' * having this property, -1.0 otherwise.', file=fp)
    print(' */', file=fp)
    print('double _PyUnicode_ToNumeric(Py_UNICODE ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for value, codepoints in numeric_items:
        # Turn text into float literals
        parts = value.split('/')
        parts = [repr(float(part)) for part in parts]
        value = '/'.join(parts)

        haswide = False
        hasnonewide = False
        codepoints.sort()
        for codepoint in codepoints:
            if codepoint < 0x10000:
                hasnonewide = True
            if codepoint >= 0x10000 and not haswide:
                print('#ifdef Py_UNICODE_WIDE', file=fp)
                haswide = True
            print('    case 0x%04X:' % (codepoint,), file=fp)
        if haswide and hasnonewide:
            print('#endif', file=fp)
        print('        return (double) %s;' % (value,), file=fp)
        if haswide and not hasnonewide:
            print('#endif', file=fp)
    print('    }', file=fp)
    print('    return -1.0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsWhitespace()
    print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)
    print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)', file=fp)
    print('{', file=fp)
    print('#ifdef WANT_WCTYPE_FUNCTIONS', file=fp)
    print('    return iswspace(ch);', file=fp)
    print('#else', file=fp)
    print('    switch (ch) {', file=fp)

    haswide = False
    hasnonewide = False
    for codepoint in sorted(spaces):
        if codepoint < 0x10000:
            hasnonewide = True
        if codepoint >= 0x10000 and not haswide:
            print('#ifdef Py_UNICODE_WIDE', file=fp)
            haswide = True
        print('    case 0x%04X:' % (codepoint,), file=fp)
    if haswide and hasnonewide:
        print('#endif', file=fp)
    print('        return 1;', file=fp)
    if haswide and not hasnonewide:
        print('#endif', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('#endif', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsLinebreak()
    print("/* Returns 1 for Unicode characters having the line break", file=fp)
    print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp)
    print(" * type 'B', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    haswide = False
    hasnonewide = False
    for codepoint in sorted(linebreaks):
        if codepoint < 0x10000:
            hasnonewide = True
        if codepoint >= 0x10000 and not haswide:
            print('#ifdef Py_UNICODE_WIDE', file=fp)
            haswide = True
        print('    case 0x%04X:' % (codepoint,), file=fp)
    if haswide and hasnonewide:
        print('#endif', file=fp)
    print('        return 1;', file=fp)
    if haswide and not hasnonewide:
        print('#endif', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    fp.close()

# --------------------------------------------------------------------
# unicode name database

def makeunicodename(unicode, trace):

    FILE = "Modules/unicodename_db.h"

    print("--- Preparing", FILE, "...")

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print(len(list(n for n in names if n is not None)), "distinct names")

    # collect unique words from names (note that we distinguish between
    # words inside a sentence, and words ending a sentence. the latter
    # includes the trailing null byte.)

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print(n, "words in text;", b, "bytes")

    wordlist = list(words.items())

    # sort on falling frequency, then by name
    def word_key(a):
        aword, alist = a
        return -len(alist), aword
    wordlist.sort(key=word_key)

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print(escapes, "escapes")

    short = 256 - escapes

    assert short > 0

    print(short, "short indexes in lexicon")

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print(n, "short indexes in phrasebook")

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
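        # Added example: a word stored mid-name, say "CAPITAL", becomes
        # "CAPITA" plus chr(ord("L")+128); the high bit on the final byte
        # replaces an explicit length or terminator in the lexicon.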
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = list(map(ord, lexicon))

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

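    # Added note: word indexes >= short are stored above as an escape byte
    # (i>>8)+short followed by the low byte, while indexes < short take a
    # single byte, so every phrasebook entry fits in an unsigned char --
    # which is what the assert below checks.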
    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set. if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("#define NAME_MAXLEN", 256, file=fp)
    print(file=fp)
    print("/* lexicon */", file=fp)
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split decomposition index table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print("/* code->name phrasebook */", file=fp)
    print("#define phrasebook_shift", shift, file=fp)
    print("#define phrasebook_short", short, file=fp)

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print("/* name->code dictionary */", file=fp)
    codehash.dump(fp, trace)

    fp.close()


def merge_old_version(version, new, old):
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value is better not 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 16:
                        # derived property changes; not yet
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, list(zip(bidir_changes, category_changes,
                                          decimal_changes, mirrored_changes,
                                          numeric_changes)),
                        normalization_changes))


# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk

class UnicodeData:
    # Record structure:
    # [ID, name, category, combining, bidi, decomp,  (6)
    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
    #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
    #  derived-props] (17)

    def __init__(self, filename, exclusions, eastasianwidth, unihan,
                 derivedprops, derivednormalizationprops=None, linebreakprops=None,
                 expand=1):
        self.changed = []
        file = open(filename)
        table = [None] * 0x110000
        while 1:
            s = file.readline()
            if not s:
                break
            s = s.strip().split(";")
            char = int(s[0], 16)
            table[char] = s

        # expand first-last ranges
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = list(range(0x110000)) # unicode 3.2

        file = open(exclusions)
        self.exclusions = {}
        for s in file:
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            char = int(s.split()[0],16)
            self.exclusions[char] = 1

        widths = [None] * 0x110000
        for s in open(eastasianwidth):
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            s = s.split()[0].split(';')
            if '..' in s[0]:
                first, last = [int(c, 16) for c in s[0].split('..')]
                chars = list(range(first, last+1))
            else:
                chars = [int(s[0], 16)]
            for char in chars:
                widths[char] = s[1]
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(set())
        for s in open(derivedprops):
            s = s.split('#', 1)[0].strip()
            if not s:
                continue

            r, p = s.split(";")
            r = r.strip()
            p = p.strip()
            if ".." in r:
                first, last = [int(c, 16) for c in r.split('..')]
                chars = list(range(first, last+1))
            else:
                chars = [int(r, 16)]
            for char in chars:
                if table[char]:
                    # Some properties (e.g. Default_Ignorable_Code_Point)
                    # apply to unassigned code points; ignore them
                    table[char][-1].add(p)

        if linebreakprops:
            for s in open(linebreakprops):
                s = s.partition('#')[0]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
                    continue
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    table[char][-1].add('Line_Break')

        if derivednormalizationprops:
            quickchecks = [0] * 0x110000 # default is Yes
            qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
            for s in open(derivednormalizationprops):
                if '#' in s:
                    s = s[:s.index('#')]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in qc_order:
                    continue
                quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
                quickcheck_shift = qc_order.index(s[1])*2
                quickcheck <<= quickcheck_shift
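                # Added note: each normalization form gets two bits of the
                # per-character quickcheck value, in qc_order order, so the
                # encoded value for a form is 0 (Yes, the default), 1 (Maybe)
                # or 2 (No).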
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    assert not (quickchecks[char]>>quickcheck_shift)&3
                    quickchecks[char] |= quickcheck
            for i in range(0, 0x110000):
                if table[i] is not None:
                    table[i].append(quickchecks[i])

        for line in open(unihan, encoding='utf-8'):
            if not line.startswith('U+'):
                continue
            code, tag, value = line.split(None, 3)[:3]
            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                           'kOtherNumeric'):
                continue
            value = value.strip().replace(',', '')
            i = int(code[2:], 16)
            # Patch the numeric field
            if table[i] is not None:
                table[i][8] = value

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = list(range(256))

# hash table tools

# this is a straight-forward reimplementation of Python's built-in
# dictionary type, using a static data structure, and a custom string
# hash algorithm.

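# Added illustration: a lookup hashes the upper-cased name with myhash(),
# starts probing at (~h) & mask, and on a collision steps by an increment
# derived from the hash; whenever the increment overflows the mask it is
# folded back with the per-size polynomial from SIZES, which is intended to
# keep the probe sequence covering the whole table.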
def myhash(s, magic):
    h = 0
    for c in map(ord, s.upper()):
        h = (h * magic) + c
        ix = h & 0xff000000
        if ix:
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
    return h

SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]

class Hash:
    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynomials")

        print(size, "slots in hash table")

        table = [None] * size

        mask = size-1

        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            incr = (h ^ (h >> 3)) & mask;
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print(n, "collisions")
        self.collisions = n

        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))

# stuff to deal with arrays of unsigned integers

class Array:

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array
        size = getsize(self.data)
        if trace:
            print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)
        file.write("static ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = "    "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    file.write(s + "\n")
                    s = "    " + i
                else:
                    s = s + i
            if s.strip():
                file.write(s + "\n")
        file.write("};\n\n")

def getsize(data):
    # return smallest possible integer size for the given array
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4

def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """
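    # Added worked example: with shift=1, t=[5,5,5,5,9,9,5,5] can be split
    # into t2=[5,5,9,9] and t1=[0,0,1,0], because the four 2-element chunks
    # are (5,5), (5,5), (9,9), (5,5) and only two distinct chunks are stored;
    # then t[5] == t2[(t1[5>>1]<<1) + (5&1)] == t2[3] == 9.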

    if trace:
        def dump(t1, t2, shift, bytes):
            print("%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes), file=sys.stderr)
        print("Size of original table:", len(t)*getsize(t), \
            "bytes", file=sys.stderr)
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxsize  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print("Best:", end=' ', file=sys.stderr)
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best

if __name__ == "__main__":
    maketables(1)