Blame - Tools/unicode/makeunicodedata.py - platform/external/python/cpython3

2000-09-24 23:18:31 +0000

[diff] [blame]

1

#

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

2

# (re)generate unicode property and type databases

3

#

Martin v. Löwis

b5c980b

2002-11-25 09:13:37 +0000

[diff] [blame]

4

# this script converts a unicode 3.2 database file to

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

5

# Modules/unicodedata_db.h, Modules/unicodename_db.h,

6

# and Objects/unicodetype_db.h

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

7

#

8

# history:

9

# 2000-09-24 fl created (based on bits and pieces from unidb)

10

# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

11

# 2000-09-25 fl added character type table

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

12

# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

13

# 2000-11-03 fl expand first/last ranges

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

14

# 2001-01-19 fl added character name tables (2.1)

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

15

# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold

Martin v. Löwis

2002-11-23 22:08:15 +0000

[diff] [blame]

16

# 2002-09-11 wd use string methods

17

# 2002-10-18 mvl update to Unicode 3.2

18

# 2002-10-22 mvl generate NFC tables

Martin v. Löwis

2002-11-24 23:05:09 +0000

[diff] [blame]

19

# 2002-11-24 mvl expand all ranges, sort names version-independently

Martin v. Löwis

b5c980b

2002-11-25 09:13:37 +0000

[diff] [blame]

20

# 2002-11-25 mvl add UNIDATA_VERSION

Hye-Shik Chang

2004-06-02 16:49:17 +0000

[diff] [blame]

21

# 2004-05-29 perky add east asian width information

Martin v. Löwis

43179c8

2006-03-11 12:43:44 +0000

[diff] [blame]

22

# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta

Georg Brandl

d52429f

2008-07-04 15:55:02 +0000

[diff] [blame]

23

# 2008-06-11 gb add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

24

#

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

25

# written by Fredrik Lundh (fredrik@pythonware.com)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

#

import sys

# Name of this script and its version; both are written into the
# "generated by" banner of every header produced below.
SCRIPT = sys.argv[0]
VERSION = "2.6"

# The Unicode Database
UNIDATA_VERSION = "5.1.0"

# Input file name templates.  The "%s" slot receives "" for the current
# release and "-<version>" for each release listed in old_versions.
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"

# Older releases that are merged into the current database.
old_versions = ["3.2.0"]

# The generated tables store a value as its position in these lists, so
# the order matters.  "Cn" appears twice; list.index() always returns the
# first occurrence (0).
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NODELTA_MASK = 0x800

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

66

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

67

def maketables(trace=0):
    """Read the Unicode data files and regenerate all database headers.

    Loads the current database, merges every release listed in
    ``old_versions`` into it with merge_old_version(), and then runs the
    name, property and type table generators in that order.

    trace -- passed through unchanged to the three generator functions.
    """
    print("--- Reading", UNICODE_DATA % "", "...")

    # Data files of the current release carry no version suffix.
    version = ""
    unicode = UnicodeData(UNICODE_DATA % version,
                          COMPOSITION_EXCLUSIONS % version,
                          EASTASIAN_WIDTH % version,
                          DERIVED_CORE_PROPERTIES % version)

    print(len(list(filter(None, unicode.table))), "characters")

    for version in old_versions:
        # Files of older releases are named like "UnicodeData-3.2.0.txt".
        suffix = "-" + version
        print("--- Reading", UNICODE_DATA % suffix, "...")
        old_unicode = UnicodeData(UNICODE_DATA % suffix,
                                  COMPOSITION_EXCLUSIONS % suffix,
                                  EASTASIAN_WIDTH % suffix,
                                  DERIVED_CORE_PROPERTIES % suffix)
        print(len(list(filter(None, old_unicode.table))), "characters")
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

91

92

# --------------------------------------------------------------------

93

# unicode character properties

94

95

def makeunicodedata(unicode, trace):
    """Generate Modules/unicodedata_db.h.

    The header contains, in this order: the table of unique property
    records, the NFC first/last reindex tables, the category /
    bidirectional / east-asian-width / decomposition-prefix string
    tables, the split index tables for records and decompositions, the
    NFC pair table, and one set of change tables per entry of
    ``unicode.changed``.

    unicode -- database object; this function reads its ``chars``,
               ``table``, ``exclusions`` and ``changed`` attributes.
    trace   -- passed through to splitbins() and Array.dump().

    Raises Exception if a decomposition field has more than 19 items.
    """
    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    # Record 0 is the all-zero record; code points without a database
    # entry keep index 0.  The cache maps a record to its table position
    # (the original had the key and value swapped, so the entry for the
    # dummy record could never be found).
    dummy = (0, 0, 0, 0, 0)
    table = [dummy]
    cache = {dummy: 0}
    index = [0] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if not record:
            continue
        # category, combining, bidirectional, mirrored, east asian width
        item = (
            CATEGORY_NAMES.index(record[2]),
            int(record[3]),
            BIDIRECTIONAL_NAMES.index(record[4]),
            record[9] == "Y",
            EASTASIANWIDTH_NAMES.index(record[15]),
        )
        # add entry to index and item tables
        i = cache.get(item)
        if i is None:
            cache[item] = i = len(table)
            table.append(item)
        index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if not record:
            continue
        if not record[5]:
            # no decomposition: point at the leading 0 of decomp_data
            decomp_index[char] = 0
            continue

        decomp = record[5].split()
        if len(decomp) > 19:
            raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)

        # prefix: an optional leading "<tag>", stored as its position in
        # decomp_prefix (position 0 is the empty prefix)
        if decomp[0][0] == "<":
            tag = decomp.pop(0)
        else:
            tag = ""
        try:
            prefix = decomp_prefix.index(tag)
        except ValueError:
            prefix = len(decomp_prefix)
            decomp_prefix.append(tag)
        assert prefix < 256

        # content: header word (length in the high bits, prefix in the
        # low byte) followed by the code points
        decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]

        # Collect NFC pairs
        if (not prefix and len(decomp) == 3 and
                char not in unicode.exclusions and
                unicode.table[decomp[1]][3] == "0"):
            _, first, last = decomp
            comp_first[first] = 1
            comp_last[last] = 1
            comp_pairs.append((first, last, char))

        # NOTE(review): the original looked the entry up with
        # decomp_data.index(decomp) first.  decomp_data only ever holds
        # ints and decomp is a list, so that search always raised
        # ValueError and every entry was appended.  The dead linear scan
        # is dropped here; the generated data is unchanged.  Real
        # de-duplication would alter the generated tables.
        decomp_index[char] = len(decomp_data)
        decomp_data.extend(decomp)
        decomp_size = decomp_size + len(decomp) * 2

    # Renumber the first/last characters of NFC pairs consecutively and
    # collect the runs of adjacent code points.
    total_first, comp_first_ranges = _number_marked(comp_first, unicode.chars)
    total_last, comp_last_ranges = _number_marked(comp_last, unicode.chars)

    # total_first x total_last matrix, flattened row by row
    comp_data = [0]*(total_first*total_last)
    for first, last, char in comp_pairs:
        comp_data[comp_first[first]*total_last + comp_last[last]] = char

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    # The with block closes the file even if a later step raises.
    with open(FILE, "w") as fp:
        print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
        print(file=fp)
        print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
        print("/* a list of unique database records */", file=fp)
        print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
        for item in table:
            print(" {%d, %d, %d, %d, %d}," % item, file=fp)
        print("};", file=fp)
        print(file=fp)

        print("/* Reindexing of NFC first characters. */", file=fp)
        print("#define TOTAL_FIRST",total_first, file=fp)
        print("#define TOTAL_LAST",total_last, file=fp)
        print("struct reindex{int start;short count,index;};", file=fp)
        print("static struct reindex nfc_first[] = {", file=fp)
        for start, end in comp_first_ranges:
            print(" { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)
        print(" {0,0,0}", file=fp)
        print("};\n", file=fp)
        print("static struct reindex nfc_last[] = {", file=fp)
        for start, end in comp_last_ranges:
            print(" { %d, %d, %d}," % (start,end-start,comp_last[start]), file=fp)
        print(" {0,0,0}", file=fp)
        print("};\n", file=fp)

        # FIXME: <fl> the following tables could be made static, and
        # the support code moved into unicodedatabase.c

        print("/* string literals */", file=fp)
        print("const char *_PyUnicode_CategoryNames[] = {", file=fp)
        for name in CATEGORY_NAMES:
            print(" \"%s\"," % name, file=fp)
        print(" NULL", file=fp)
        print("};", file=fp)

        print("const char *_PyUnicode_BidirectionalNames[] = {", file=fp)
        for name in BIDIRECTIONAL_NAMES:
            print(" \"%s\"," % name, file=fp)
        print(" NULL", file=fp)
        print("};", file=fp)

        print("const char *_PyUnicode_EastAsianWidthNames[] = {", file=fp)
        for name in EASTASIANWIDTH_NAMES:
            print(" \"%s\"," % name, file=fp)
        print(" NULL", file=fp)
        print("};", file=fp)

        print("static const char *decomp_prefix[] = {", file=fp)
        for name in decomp_prefix:
            print(" \"%s\"," % name, file=fp)
        print(" NULL", file=fp)
        print("};", file=fp)

        # split record index table
        index1, index2, shift = splitbins(index, trace)

        print("/* index tables for the database records */", file=fp)
        print("#define SHIFT", shift, file=fp)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # split decomposition index table
        index1, index2, shift = splitbins(decomp_index, trace)

        print("/* decomposition data */", file=fp)
        Array("decomp_data", decomp_data).dump(fp, trace)

        print("/* index tables for the decomposition data */", file=fp)
        print("#define DECOMP_SHIFT", shift, file=fp)
        Array("decomp_index1", index1).dump(fp, trace)
        Array("decomp_index2", index2).dump(fp, trace)

        # split NFC pair table
        index1, index2, shift = splitbins(comp_data, trace)
        print("/* NFC pairs */", file=fp)
        print("#define COMP_SHIFT", shift, file=fp)
        Array("comp_index", index1).dump(fp, trace)
        Array("comp_data", index2).dump(fp, trace)

        # Generate delta tables for old versions
        for version, changes, normalization in unicode.changed:
            cversion = version.replace(".","_")
            # de-duplicate the change records; record 0 is changes[0]
            records = [changes[0]]
            seen = {changes[0]: 0}
            change_index = [0] * len(changes)
            for i, record in enumerate(changes):
                try:
                    change_index[i] = seen[record]
                except KeyError:
                    change_index[i] = seen[record] = len(records)
                    records.append(record)
            index1, index2, shift = splitbins(change_index, trace)
            print("static const change_record change_records_%s[] = {" % cversion, file=fp)
            for record in records:
                print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
            print("};", file=fp)
            Array("changes_%s_index" % cversion, index1).dump(fp, trace)
            Array("changes_%s_data" % cversion, index2).dump(fp, trace)
            print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
            print("{", file=fp)
            print("\tint index;", file=fp)
            print("\tif (n >= 0x110000) index = 0;", file=fp)
            print("\telse {", file=fp)
            print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
            print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
                  (cversion, shift, ((1<<shift)-1)), file=fp)
            print("\t}", file=fp)
            print("\treturn change_records_%s+index;" % cversion, file=fp)
            print("}\n", file=fp)
            print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
            print("{", file=fp)
            print("\tswitch(n) {", file=fp)
            for k, v in normalization:
                print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
            print("\tdefault: return 0;", file=fp)
            print("\t}\n}\n", file=fp)


def _number_marked(marks, chars):
    """Number the marked code points and group them into adjacent runs.

    Every entry of *marks* that is not None is overwritten in place with
    its 0-based ordinal among the marked entries, visiting code points in
    the order given by *chars*.

    Returns (count, ranges): the number of marked entries and a list of
    inclusive (first, last) runs of consecutive marked code points.  The
    list is empty when nothing is marked.
    """
    count = 0
    ranges = []
    run = None
    for code in chars:
        if marks[code] is None:
            continue
        marks[code] = count
        count += 1
        if run is None:
            run = (code, code)
        elif run[1]+1 == code:
            # extends the current run
            run = (run[0], code)
        else:
            ranges.append(run)
            run = (code, code)
    if run is not None:
        ranges.append(run)
    return count, ranges

341

342

# --------------------------------------------------------------------

343

# unicode character type tables

344

345

def makeunicodetype(unicode, trace):
    """Generate Objects/unicodetype_db.h.

    For every code point with a database record this builds a type record
    (upper, lower, title, decimal, digit, flags), de-duplicates the
    records, and writes the record table followed by a split index table.

    unicode -- database object; this function reads its ``chars`` and
               ``table`` attributes.
    trace   -- passed through to splitbins() and Array.dump().
    """
    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types

    # Record 0 is the all-zero record; code points without a database
    # entry keep index 0.  The cache maps a record to its table position.
    # The original had the key and value swapped, so a character whose
    # record is all zeros got a duplicate table entry; it now reuses
    # record 0 (same data, one fewer record in the generated header).
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {dummy: 0}
    index = [0] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if not record:
            continue

        # extract database properties
        category = record[2]
        bidirectional = record[4]
        properties = record[16]

        flags = 0
        if category in ("Lm", "Lt", "Lu", "Ll", "Lo"):
            flags |= ALPHA_MASK
        if category == "Ll":
            flags |= LOWER_MASK
        if category == "Zl" or bidirectional == "B":
            flags |= LINEBREAK_MASK
        if category == "Zs" or bidirectional in ("WS", "B", "S"):
            flags |= SPACE_MASK
        if category == "Lt":
            flags |= TITLE_MASK
        if category == "Lu":
            flags |= UPPER_MASK
        if char == ord(" ") or category[0] not in ("C", "Z"):
            flags |= PRINTABLE_MASK
        if "XID_Start" in properties:
            flags |= XID_START_MASK
        if "XID_Continue" in properties:
            flags |= XID_CONTINUE_MASK

        # Case mappings are hexadecimal fields; an empty upper/lower
        # field means the character maps to itself.
        upper = int(record[12], 16) if record[12] else char
        lower = int(record[13], 16) if record[13] else char
        if record[14]:
            title = int(record[14], 16)
        else:
            # UCD.html says that a missing title char means that
            # it defaults to the uppercase character, not to the
            # character itself. Apparently, in the current UCD (5.x)
            # this feature is never used
            title = upper

        # use delta predictor for upper/lower/title if it fits
        upper_d = upper - char
        lower_d = lower - char
        title_d = title - char
        if -32768 <= upper_d <= 32767 and \
           -32768 <= lower_d <= 32767 and \
           -32768 <= title_d <= 32767:
            # use deltas, reduced to 16 bits
            upper = upper_d & 0xffff
            lower = lower_d & 0xffff
            title = title_d & 0xffff
        else:
            # keep the absolute code points and flag the record
            flags |= NODELTA_MASK

        # decimal digit, integer digit
        decimal = 0
        if record[6]:
            flags |= DECIMAL_MASK
            decimal = int(record[6])
        digit = 0
        if record[7]:
            flags |= DIGIT_MASK
            digit = int(record[7])

        item = (upper, lower, title, decimal, digit, flags)

        # add entry to index and item tables
        i = cache.get(item)
        if i is None:
            cache[item] = i = len(table)
            table.append(item)
        index[char] = i

    print(len(table), "unique character type entries")

    print("--- Writing", FILE, "...")

    # The with block closes the file even if a later step raises.
    with open(FILE, "w") as fp:
        print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
        print(file=fp)
        print("/* a list of unique character type descriptors */", file=fp)
        print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
        for item in table:
            print(" {%d, %d, %d, %d, %d, %d}," % item, file=fp)
        print("};", file=fp)
        print(file=fp)

        # split type index table
        index1, index2, shift = splitbins(index, trace)

        print("/* type indexes */", file=fp)
        print("#define SHIFT", shift, file=fp)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

455

456

# --------------------------------------------------------------------

457

# unicode name database

458

459

def makeunicodename(unicode, trace):
    """Generate Modules/unicodename_db.h.

    Character names are split into words.  The header contains a lexicon
    of those words, a phrasebook that spells each name as a sequence of
    lexicon indexes, split offset tables into the phrasebook, and a hash
    table built by Hash() for name->code lookup.

    unicode -- database object; this function reads its ``chars`` and
               ``table`` attributes.
    trace   -- passed through to splitbins(), Array.dump() and the hash
               table's dump().
    """
    FILE = "Modules/unicodename_db.h"

    print("--- Preparing", FILE, "...")

    # collect names; entries starting with "<" are skipped.  ``names``
    # holds each name with a trailing NUL, ``data`` holds the plain
    # (name, code point) pairs used for the hash table further down.
    names = [None] * len(unicode.chars)
    data = []

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)
                data.append((name, char))

    print(sum(1 for n in names if n is not None), "distinct names")

    # collect unique words from names (note that we differ between
    # words inside a sentence, and words ending a sentence. the
    # latter includes the trailing null byte.

    # word -> list with one element per occurrence; only the length of
    # the list is used afterwards (as the word's frequency)
    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            parts = name.split()
            b = b + len(name)
            n = n + len(parts)
            for word in parts:
                uses = words.get(word)
                if uses:
                    uses.append(None)
                else:
                    words[word] = [len(words)]

    print(n, "words in text;", b, "bytes")

    wordlist = list(words.items())

    # sort on falling frequency, then by name
    def word_key(a):
        aword, alist = a
        return -len(alist), aword
    wordlist.sort(key=word_key)

    # figure out how many phrasebook escapes we need: the smallest count
    # such that escapes * 256 covers the whole word list
    escapes = (len(wordlist) + 255) // 256
    print(escapes, "escapes")

    short = 256 - escapes

    assert short > 0

    print(short, "short indexes in lexicon")

    # statistics
    n = sum(len(uses) for _, uses in wordlist[:short])
    print(n, "short indexes in phrasebook")

    # pick the most commonly used words, and sort the rest.
    # NOTE(review): the original comment says the rest is sorted "on
    # falling length (to maximize overlap)", but the key is the word
    # itself, i.e. reverse lexicographic order.  The sort is kept as it
    # was so the generated lexicon stays the same; confirm which order
    # is intended before changing it.

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for word, _ in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        encoded = word[:-1] + chr(ord(word[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(encoded)
        if o < 0:
            o = offset
            lexicon = lexicon + encoded
            offset = offset + len(word)
        words[word] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = list(map(ord, lexicon))

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            parts = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for word in parts:
                i = words[word]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set. if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print("--- Writing", FILE, "...")

    # The with block closes the file even if a later step raises.
    with open(FILE, "w") as fp:
        print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
        print(file=fp)
        print("#define NAME_MAXLEN", 256, file=fp)
        print(file=fp)
        print("/* lexicon */", file=fp)
        Array("lexicon", lexicon).dump(fp, trace)
        Array("lexicon_offset", lexicon_offset).dump(fp, trace)

        # split phrasebook offset table
        offset1, offset2, shift = splitbins(phrasebook_offset, trace)

        print("/* code->name phrasebook */", file=fp)
        print("#define phrasebook_shift", shift, file=fp)
        print("#define phrasebook_short", short, file=fp)

        Array("phrasebook", phrasebook).dump(fp, trace)
        Array("phrasebook_offset1", offset1).dump(fp, trace)
        Array("phrasebook_offset2", offset2).dump(fp, trace)

        print("/* name->code dictionary */", file=fp)
        codehash.dump(fp, trace)

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

618

619

def merge_old_version(version, new, old):
    """Record on *new* the property values that were different in *old*.

    Both *new* and *old* expose ``table`` (one record or None per code
    point) and ``exclusions``.  For every code point whose record differs,
    the OLD value of each tracked field is stored; the field indexes follow
    the record structure documented on UnicodeData.  The result is appended
    to ``new.changed`` as a tuple::

        (version,
         [(bidir, category, decimal, mirrored, numeric), ...],  # per code point
         normalization_changes)                                 # (code, decomp)

    Raises NotImplementedError when the exclusion tables differ, and a
    local ``Difference`` exception when an unexpected field changed.
    """
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    class Difference(Exception):
        pass

    # Fields whose changes are deliberately ignored: ISO comment (11),
    # simple uppercase/lowercase/titlecase mappings (12, 13, 14) and
    # derived properties (16, not handled yet).
    ignored_fields = (11, 12, 13, 14, 16)

    npoints = 0x110000
    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF] * npoints
    category_changes = [0xFF] * npoints
    decimal_changes = [0xFF] * npoints
    mirrored_changes = [0xFF] * npoints
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * npoints
    # normalization_changes is a list of key-value pairs
    normalization_changes = []

    for i in range(npoints):
        new_rec = new.table[i]
        old_rec = old.table[i]
        if new_rec is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old_rec is None
            continue
        if old_rec is None:
            # unassigned in the old version; category 0 is "unassigned"
            category_changes[i] = 0
            continue
        if old_rec == new_rec:
            continue
        # walk the fields and store the old value of each one that differs
        for k, value in enumerate(old_rec):
            if value == new_rec[k]:
                continue
            if k == 2:
                category_changes[i] = CATEGORY_NAMES.index(value)
            elif k == 4:
                bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
            elif k == 5:
                # We assume that all normalization changes are in 1:1 mappings
                assert " " not in value
                normalization_changes.append((i, value))
            elif k == 6:
                # we only support changes where the old value is a single digit
                assert value in "0123456789"
                decimal_changes[i] = int(value)
            elif k == 8:
                # Since 0 encodes "no change", the old value is better not 0
                assert value != "0" and value != "-1"
                if value:
                    assert re.match("^[0-9]+$", value)
                    numeric_changes[i] = int(value)
                else:
                    numeric_changes[i] = -1
            elif k == 9:
                mirrored_changes[i] = '1' if value == 'Y' else '0'
            elif k in ignored_fields:
                pass
            else:
                raise Difference(hex(i), k, old_rec, new_rec)

    per_char_changes = list(zip(bidir_changes, category_changes,
                                decimal_changes, mirrored_changes,
                                numeric_changes))
    new.changed.append((version, per_char_changes, normalization_changes))

Tim Peters

88ca467

2006-03-10 23:39:56 +0000

[diff] [blame]

702

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

703

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

704

# --------------------------------------------------------------------

705

# the following support code is taken from the unidb utilities

706

707

708

# load a unicode-data file from disk

709

Walter Dörwald

2002-09-11 20:36:02 +0000

[diff] [blame]

710

import sys

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

711

712

class UnicodeData:
    """One version of the Unicode character database, loaded from disk.

    Public attributes:
      filename   -- the path passed as *filename*
      table      -- list of 0x110000 entries indexed by code point; each
                    entry is None or a list of fields (see below)
      chars      -- list of code points to process (all of them until
                    uselatin1() is called)
      exclusions -- dict mapping each code point listed in the
                    exclusions file to 1
      changed    -- list that starts empty; this class never fills it
    """
    # Record structure:
    # [ID, name, category, combining, bidi, decomp,                  (6)
    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name,       (11)
    #  ISO-comment, uppercase, lowercase, titlecase, ea-width,       (16)
    #  derived-props]                                                (17)

    def __init__(self, filename, exclusions, eastasianwidth,
                 derivedprops, expand=1):
        """Load the four data files.

        filename       -- ';'-separated records; field 0 is the hex code
        exclusions     -- one hex code point per line, '#' starts a comment
        eastasianwidth -- lines of the form ``code[..code];width``
        derivedprops   -- lines of the form ``code[..code] ; property``
        expand         -- when true, fill in the code points lying between
                          ``<..., First>`` and ``<..., Last>`` records

        Every file is closed as soon as it has been read, also when a
        malformed line raises.
        """
        self.changed = []
        table = [None] * 0x110000
        with open(filename) as fp:
            for line in fp:
                fields = line.strip().split(";")
                table[int(fields[0], 16)] = fields

        # expand first-last ranges
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
                    # inside a range: copy the First record, patch the code
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = list(range(0x110000)) # unicode 3.2

        self.exclusions = {}
        with open(exclusions) as fp:
            for line in fp:
                line = line.strip()
                if not line or line[0] == '#':
                    continue
                self.exclusions[int(line.split()[0], 16)] = 1

        widths = [None] * 0x110000
        with open(eastasianwidth) as fp:
            for line in fp:
                line = line.strip()
                if not line or line[0] == '#':
                    continue
                # keep only the "range;width" token, drop the trailing comment
                fields = line.split()[0].split(';')
                for char in self._code_points(fields[0]):
                    widths[char] = fields[1]

        # every assigned record gets its width and an (initially empty)
        # set of derived properties as the last two fields
        for i, record in enumerate(table):
            if record is not None:
                record.append(widths[i])
                record.append(set())

        with open(derivedprops) as fp:
            for line in fp:
                line = line.split('#', 1)[0].strip()
                if not line:
                    continue
                r, p = line.split(";")
                p = p.strip()
                for char in self._code_points(r.strip()):
                    if table[char]:
                        # Some properties (e.g. Default_Ignorable_Code_Point)
                        # apply to unassigned code points; ignore them
                        table[char][-1].add(p)

    @staticmethod
    def _code_points(spec):
        # turn "XXXX" or "XXXX..YYYY" (hex) into the code points it covers
        if '..' in spec:
            first, last = [int(c, 16) for c in spec.split('..')]
            return range(first, last+1)
        return [int(spec, 16)]

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = list(range(256))

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

809

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

810

# hash table tools

811

812

# this is a straight-forward reimplementation of Python's built-in

813

# dictionary type, using a static data structure, and a custom string

814

# hash algorithm.

815

816

def myhash(s, magic):
    """Return the case-insensitive hash of string *s*, below 2**24.

    Each character of the upper-cased string is mixed in as
    ``h * magic + ord(ch)``; whenever bits 24-31 become non-zero they are
    folded back into the low byte and the value is cut to 24 bits.
    """
    acc = 0
    for ch in s.upper():
        acc = acc * magic + ord(ch)
        high = acc & 0xff000000
        if high:
            folded = acc ^ ((high >> 24) & 0xff)
            acc = folded & 0x00ffffff
    return acc

# (table size, polynomial offset) pairs, one per power of two from
# 2**2 to 2**24; Hash picks the first size larger than its data and
# uses size + offset as the probing polynomial.
SIZES = [
    (4, 3),
    (8, 3),
    (16, 3),
    (32, 5),
    (64, 3),
    (128, 3),
    (256, 29),
    (512, 17),
    (1024, 9),
    (2048, 5),
    (4096, 83),
    (8192, 27),
    (16384, 43),
    (32768, 3),
    (65536, 45),
    (131072, 9),
    (262144, 39),
    (524288, 39),
    (1048576, 9),
    (2097152, 5),
    (4194304, 3),
    (8388608, 33),
    (16777216, 27),
]

class Hash:
    """Static open-addressing hash table built from (key, value) pairs.

    Only the values are stored, in the slot selected by myhash(key, magic)
    and the probing sequence below; empty slots end up as 0.
    """

    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size: first entry of SIZES larger than the data
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynominals")

        print(size, "slots in hash table")

        mask = size - 1
        slots = [None] * size
        collisions = 0

        # initialize hash table
        for key, value in data:
            h = myhash(key, magic)
            pos = (~h) & mask
            if slots[pos] is None:
                slots[pos] = value
                continue
            # home slot taken: probe with a step derived from the hash
            step = ((h ^ (h >> 3)) & mask) or mask
            while True:
                collisions += 1
                pos = (pos + step) & mask
                if slots[pos] is None:
                    slots[pos] = value
                    break
                step <<= 1
                if step > mask:
                    step ^= poly

        print(collisions, "collisions")
        self.collisions = collisions

        # unused slots are written out as 0
        slots = [0 if entry is None else entry for entry in slots]

        self.data = Array(name + "_hash", slots)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array followed by the table parameters
        self.data.dump(file, trace)
        for label, number in (("magic", self.magic),
                              ("size", self.size),
                              ("poly", self.poly)):
            file.write("#define %s_%s %d\n" % (self.name, label, number))

895

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

896

# stuff to deal with arrays of unsigned integers

class Array:
    """A named sequence of unsigned integers that can be written as C source."""

    def __init__(self, name, data):
        self.data = data
        self.name = name

    def dump(self, file, trace=0):
        """Write the data to *file* as a static C array definition.

        The element type follows getsize(self.data).  When *trace* is
        true, the array name and its size in bytes go to sys.stderr.
        """
        width = getsize(self.data)
        if trace:
            print(self.name+":", width*len(self.data), "bytes", file=sys.stderr)

        if width == 1:
            ctype = "unsigned char"
        elif width == 2:
            ctype = "unsigned short"
        else:
            ctype = "unsigned int"
        file.write("static ")
        file.write(ctype)
        file.write(" " + self.name + "[] = {\n")

        if self.data:
            line = " "
            for item in self.data:
                cell = str(item) + ", "
                # start a new output line once this one would pass 78 columns
                if len(line) + len(cell) > 78:
                    file.write(line + "\n")
                    line = " " + cell
                else:
                    line += cell
            if line.strip():
                file.write(line + "\n")
        file.write("};\n\n")

def getsize(data):
    """Return the smallest integer size in bytes (1, 2 or 4) for *data*.

    The result is 1 when every item is below 256, 2 when every item is
    below 65536, and 4 otherwise.  An empty sequence has nothing to
    store and reports the smallest size, 1.
    """
    # max() raises ValueError on an empty sequence; treat "no items" as 0
    # so that empty tables can still be measured and dumped.
    maxdata = max(data) if data else 0
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4

Tim Peters

2000-09-25 07:13:41 +0000

[diff] [blame]

940

def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints; splitting pays off when many of them repeat.
    The result is a pair of int lists t1 and t2 plus an int shift, picked
    so that the combined size of t1 and t2 (as C arrays) is smallest,
    and such that for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    A non-zero trace (the default is zero) prints progress info to
    sys.stderr; larger values print more.
    """

    import sys

    def report(index_table, data_table, nbits, nbytes):
        print("%d+%d bins at shift %d; %d bytes" % (
            len(index_table), len(data_table), nbits, nbytes), file=sys.stderr)

    if trace:
        print("Size of original table:", len(t)*getsize(t),
              "bytes", file=sys.stderr)

    # the most we can shift the last valid index and still have something left
    last = len(t) - 1
    maxshift = last.bit_length() - 1 if last > 0 else 0

    smallest = sys.maxsize  # smallest total size so far
    t = tuple(t)            # so slices can be dict keys
    for shift in range(maxshift + 1):
        width = 1 << shift
        t1, t2 = [], []
        seen = {}           # bin contents -> offset of that bin in t2
        for start in range(0, len(t), width):
            chunk = t[start:start + width]
            offset = seen.get(chunk)
            if offset is None:
                offset = seen[chunk] = len(t2)
                t2.extend(chunk)
            t1.append(offset >> shift)
        # determine memory size
        total = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            report(t1, t2, shift, total)
        if total < smallest:
            best = t1, t2, shift
            smallest = total

    t1, t2, shift = best
    if trace:
        print("Best:", end=' ', file=sys.stderr)
        report(t1, t2, shift, smallest)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = (1 << shift) - 1     # low-bit mask of shift bits
        for i, expected in enumerate(t):
            assert expected == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

1001

1002

if __name__ == "__main__":

Fredrik Lundh