#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
# 2008-06-11 gb   add NONPRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#

import re
import sys

SCRIPT = sys.argv[0]
VERSION = "2.5"

# The Unicode Database
UNIDATA_VERSION = "4.1.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"

old_versions = ["3.2.0"]

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
NONPRINTABLE_MASK = 0x400

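# Illustration only (an assumption about the consumer, not used by this
# script): the "flags" field of a type record built in makeunicodetype()
# below is a bitwise OR of the masks above, so an uppercase letter carries
# at least ALPHA_MASK | UPPER_MASK, and the C code in Objects/unicodectype.c
# is expected to test individual bits, along the lines of
# (flags & UPPER_MASK) != 0.
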
def maketables(trace=0):

    print("--- Reading", UNICODE_DATA % "", "...")

    version = ""
    unicode = UnicodeData(UNICODE_DATA % version,
                          COMPOSITION_EXCLUSIONS % version,
                          EASTASIAN_WIDTH % version,
                          DERIVED_CORE_PROPERTIES % version)

    print(len(list(filter(None, unicode.table))), "characters")

    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-"+version), "...")
        old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
                                  COMPOSITION_EXCLUSIONS % ("-"+version),
                                  EASTASIAN_WIDTH % ("-"+version),
                                  DERIVED_CORE_PROPERTIES % ("-"+version))
        print(len(list(filter(None, old_unicode.table))), "characters")
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)

# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):

    dummy = (0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                if len(decomp) > 19:
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
    print("/* a list of unique database records */", file=fp)
    print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    print("/* Reindexing of NFC first characters. */", file=fp)
    print("#define TOTAL_FIRST",total_first, file=fp)
    print("#define TOTAL_LAST",total_last, file=fp)
    print("struct reindex{int start;short count,index;};", file=fp)
    print("static struct reindex nfc_first[] = {", file=fp)
    for start,end in comp_first_ranges:
        print("    { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)
    print("    {0,0,0}", file=fp)
    print("};\n", file=fp)
    print("static struct reindex nfc_last[] = {", file=fp)
    for start,end in comp_last_ranges:
        print("    { %d, %d, %d}," % (start,end-start,comp_last[start]), file=fp)
    print("    {0,0,0}", file=fp)
    print("};\n", file=fp)

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print("/* string literals */", file=fp)
    print("const char *_PyUnicode_CategoryNames[] = {", file=fp)
    for name in CATEGORY_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_BidirectionalNames[] = {", file=fp)
    for name in BIDIRECTIONAL_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_EastAsianWidthNames[] = {", file=fp)
    for name in EASTASIANWIDTH_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("static const char *decomp_prefix[] = {", file=fp)
    for name in decomp_prefix:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print("/* index tables for the database records */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print("/* decomposition data */", file=fp)
    Array("decomp_data", decomp_data).dump(fp, trace)

    print("/* index tables for the decomposition data */", file=fp)
    print("#define DECOMP_SHIFT", shift, file=fp)
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print("/* NFC pairs */", file=fp)
    print("#define COMP_SHIFT", shift, file=fp)
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print("static const change_record change_records_%s[] = {" % cversion, file=fp)
        for record in records:
            print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
        print("};", file=fp)
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tint index;", file=fp)
        print("\tif (n >= 0x110000) index = 0;", file=fp)
        print("\telse {", file=fp)
        print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
        print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1)), file=fp)
        print("\t}", file=fp)
        print("\treturn change_records_%s+index;" % cversion, file=fp)
        print("}\n", file=fp)
        print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tswitch(n) {", file=fp)
        for k, v in normalization:
            print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
        print("\tdefault: return 0;", file=fp)
        print("\t}\n}\n", file=fp)

    fp.close()

# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if category == "Zl" or bidirectional == "B":
                flags |= LINEBREAK_MASK
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            if category[0] == "C":
                flags |= NONPRINTABLE_MASK
            if category[0] == "Z" and char != ord(" "):
                flags |= NONPRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            # use delta predictor for upper/lower/title
            if record[12]:
                upper = int(record[12], 16) - char
                assert -32768 <= upper <= 32767
                upper = upper & 0xffff
            else:
                upper = 0
            if record[13]:
                lower = int(record[13], 16) - char
                assert -32768 <= lower <= 32767
                lower = lower & 0xffff
            else:
                lower = 0
            if record[14]:
                title = int(record[14], 16) - char
399 assert -32768 <= lower <= 32767
                title = title & 0xffff
            else:
                title = 0
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("/* a list of unique character type descriptors */", file=fp)
    print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    # split decomposition index table
    index1, index2, shift = splitbins(index, trace)

    print("/* type indexes */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    fp.close()

# --------------------------------------------------------------------
# unicode name database

def CmpToKey(mycmp):
    'Convert a cmp= function into a key= function'
    class K(object):
        def __init__(self, obj, *args):
            self.obj = obj
        def __lt__(self, other):
            return mycmp(self.obj, other.obj) == -1
    return K

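# Decoding sketch (illustration of the format produced by makeunicodename()
# below; this code is not used by the script): a phrasebook entry is a
# sequence of lexicon word indexes.  Indexes below "short" fit in one byte;
# larger ones are written as an escape byte plus a low byte, so a reader
# would do roughly
#
#   i = phrasebook[pos]
#   if i >= short:
#       i = ((i - short) << 8) + phrasebook[pos + 1]
#
# and then copy word i out of "lexicon", stopping at the byte that has
# bit 7 set (the end-of-word marker added when the lexicon is built).
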
def makeunicodename(unicode, trace):

    FILE = "Modules/unicodename_db.h"

    print("--- Preparing", FILE, "...")

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print(len(list(n for n in names if n is not None)), "distinct names")

    # collect unique words from names (note that we distinguish between
    # words inside a sentence and words ending a sentence; the latter
    # include the trailing null byte.)

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print(n, "words in text;", b, "bytes")

    wordlist = list(words.items())

    # sort on falling frequency, then by name
    def cmpwords(a,b):
        aword, alist = a
        bword, blist = b
        r = -cmp(len(alist),len(blist))
        if r:
            return r
        return cmp(aword, bword)
    wordlist.sort(key=CmpToKey(cmpwords))

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print(escapes, "escapes")

    short = 256 - escapes

    assert short > 0

    print(short, "short indexes in lexicon")

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print(n, "short indexes in phrasebook")

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)
533
534 # generate lexicon from words
535
536 lexicon_offset = [0]
537 lexicon = ""
538 words = {}
539
540 # build a lexicon string
541 offset = 0
542 for w, x in wordlist:
543 # encoding: bit 7 indicates last character in word (chr(128)
544 # indicates the last character in an entire string)
545 ww = w[:-1] + chr(ord(w[-1])+128)
546 # reuse string tails, when possible
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000547 o = lexicon.find(ww)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000548 if o < 0:
549 o = offset
550 lexicon = lexicon + ww
551 offset = offset + len(w)
552 words[w] = len(lexicon_offset)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000553 lexicon_offset.append(o)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000554
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000555 lexicon = list(map(ord, lexicon))
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000556
557 # generate phrasebook from names and lexicon
558 phrasebook = [0]
559 phrasebook_offset = [0] * len(unicode.chars)
560 for char in unicode.chars:
561 name = names[char]
562 if name:
563 w = name.split()
564 phrasebook_offset[char] = len(phrasebook)
565 for w in w:
566 i = words[w]
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000567 if i < short:
568 phrasebook.append(i)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000569 else:
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000570 # store as two bytes
571 phrasebook.append((i>>8) + short)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000572 phrasebook.append(i&255)
573
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000574 assert getsize(phrasebook) == 1
575
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000576 #
577 # unicode name hash table
578
579 # extract names
580 data = []
581 for char in unicode.chars:
582 record = unicode.table[char]
583 if record:
584 name = record[1].strip()
585 if name and name[0] != "<":
586 data.append((name, char))
587
588 # the magic number 47 was chosen to minimize the number of
589 # collisions on the current data set. if you like, change it
590 # and see what happens...
591
592 codehash = Hash("code", data, 47)
593
Collin Winter6afaeb72007-08-03 17:06:41 +0000594 print("--- Writing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000595
596 fp = open(FILE, "w")
Collin Winter6afaeb72007-08-03 17:06:41 +0000597 print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
598 print(file=fp)
599 print("#define NAME_MAXLEN", 256, file=fp)
600 print(file=fp)
601 print("/* lexicon */", file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000602 Array("lexicon", lexicon).dump(fp, trace)
603 Array("lexicon_offset", lexicon_offset).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000604
605 # split decomposition index table
606 offset1, offset2, shift = splitbins(phrasebook_offset, trace)
607
Collin Winter6afaeb72007-08-03 17:06:41 +0000608 print("/* code->name phrasebook */", file=fp)
609 print("#define phrasebook_shift", shift, file=fp)
610 print("#define phrasebook_short", short, file=fp)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000611
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000612 Array("phrasebook", phrasebook).dump(fp, trace)
613 Array("phrasebook_offset1", offset1).dump(fp, trace)
614 Array("phrasebook_offset2", offset2).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000615
Collin Winter6afaeb72007-08-03 17:06:41 +0000616 print("/* name->code dictionary */", file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000617 codehash.dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000618
619 fp.close()
620
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000621
def merge_old_version(version, new, old):
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value".
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value is better not 0
                        assert value != "0" and value != "-1"
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            assert re.match("^[0-9]+$", value)
                            numeric_changes[i] = int(value)
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 16:
                        # derived property changes; not yet
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, list(zip(bidir_changes, category_changes,
                                          decimal_changes, numeric_changes)),
                        normalization_changes))


# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk

import sys

class UnicodeData:
    # Record structure:
    # [ID, name, category, combining, bidi, decomp,  (6)
    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
    #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
    #  derived-props] (17)

    def __init__(self, filename, exclusions, eastasianwidth,
                 derivedprops, expand=1):
        self.changed = []
        file = open(filename)
        table = [None] * 0x110000
        while 1:
            s = file.readline()
            if not s:
                break
            s = s.strip().split(";")
            char = int(s[0], 16)
            table[char] = s

        # expand first-last ranges
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = list(range(0x110000)) # unicode 3.2

        file = open(exclusions)
        self.exclusions = {}
        for s in file:
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            char = int(s.split()[0],16)
            self.exclusions[char] = 1

        widths = [None] * 0x110000
        for s in open(eastasianwidth):
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            s = s.split()[0].split(';')
            if '..' in s[0]:
                first, last = [int(c, 16) for c in s[0].split('..')]
                chars = list(range(first, last+1))
            else:
                chars = [int(s[0], 16)]
            for char in chars:
                widths[char] = s[1]
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(set())
        for s in open(derivedprops):
            s = s.split('#', 1)[0].strip()
            if not s:
                continue

            r, p = s.split(";")
            r = r.strip()
            p = p.strip()
            if ".." in r:
                first, last = [int(c, 16) for c in r.split('..')]
                chars = list(range(first, last+1))
            else:
                chars = [int(r, 16)]
            for char in chars:
                if table[char]:
                    # Some properties (e.g. Default_Ignorable_Code_Point)
                    # apply to unassigned code points; ignore them
                    table[char][-1].add(p)

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = list(range(256))

# hash table tools

# this is a straight-forward reimplementation of Python's built-in
# dictionary type, using a static data structure, and a custom string
# hash algorithm.

def myhash(s, magic):
    h = 0
    for c in map(ord, s.upper()):
        h = (h * magic) + c
        ix = h & 0xff000000
        if ix:
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
    return h

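# Usage sketch (an assumption about the consumer, not exercised here): a
# name->code lookup against the generated table recomputes the same hash
# and follows the same probe sequence as Hash.__init__ below, roughly
#
#   h = myhash(name, code_magic)
#   i = (~h) & (code_size - 1)
#   # ...then step by incr = (h ^ (h >> 3)) & mask, doubling and folding
#   # with code_poly exactly as in the collision loop below
#
# where code_magic, code_size and code_poly are the #defines written by
# Hash.dump(), and empty slots are stored as 0.
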
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]

class Hash:
    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynomials")

        print(size, "slots in hash table")

        table = [None] * size

        mask = size-1

        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            incr = (h ^ (h >> 3)) & mask
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print(n, "collisions")
        self.collisions = n

        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))

# stuff to deal with arrays of unsigned integers

class Array:

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array
        size = getsize(self.data)
        if trace:
            print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)
        file.write("static ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = "    "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    file.write(s + "\n")
                    s = "    " + i
                else:
                    s = s + i
            if s.strip():
                file.write(s + "\n")
        file.write("};\n\n")

def getsize(data):
    # return smallest possible integer size for the given array
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4

def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """

    import sys
    if trace:
        def dump(t1, t2, shift, bytes):
            print("%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes), file=sys.stderr)
        print("Size of original table:", len(t)*getsize(t), \
            "bytes", file=sys.stderr)
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxsize  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print("Best:", end=' ', file=sys.stderr)
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift)    # i.e., low-bit mask of shift bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best

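# Usage sketch (illustration only): a value is recovered from the two tables
# returned by splitbins() with
#
#   t1, t2, shift = splitbins(data)
#   mask = (1 << shift) - 1
#   value = t2[(t1[i >> shift] << shift) + (i & mask)]
#
# which is the same lookup the generated C code performs, e.g. in the
# get_change_*() functions emitted by makeunicodedata().
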
if __name__ == "__main__":
    maketables(1)