blob: 330eb2d9261153ad94e9c87ca7ff54328b01c944 [file] [log] [blame]
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001#
Fredrik Lundhe9133f72000-09-25 17:59:57 +00002# (re)generate unicode property and type databases
3#
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00004# this script converts a unicode 3.2 database file to
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00005# Modules/unicodedata_db.h, Modules/unicodename_db.h,
6# and Objects/unicodetype_db.h
Fredrik Lundhcfcea492000-09-25 08:07:06 +00007#
8# history:
9# 2000-09-24 fl created (based on bits and pieces from unidb)
10# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
Fredrik Lundhe9133f72000-09-25 17:59:57 +000011# 2000-09-25 fl added character type table
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000012# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000013# 2000-11-03 fl expand first/last ranges
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000014# 2001-01-19 fl added character name tables (2.1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000015# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold
Martin v. Löwis677bde22002-11-23 22:08:15 +000016# 2002-09-11 wd use string methods
17# 2002-10-18 mvl update to Unicode 3.2
18# 2002-10-22 mvl generate NFC tables
Martin v. Löwis97225da2002-11-24 23:05:09 +000019# 2002-11-24 mvl expand all ranges, sort names version-independently
Martin v. Löwisb5c980b2002-11-25 09:13:37 +000020# 2002-11-25 mvl add UNIDATA_VERSION
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +000021# 2004-05-29 perky add east asian width information
Martin v. Löwis43179c82006-03-11 12:43:44 +000022# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
Fredrik Lundhcfcea492000-09-25 08:07:06 +000023#
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000024# written by Fredrik Lundh (fredrik@pythonware.com)
Fredrik Lundhf367cac2000-09-24 23:18:31 +000025#
26
import sys

SCRIPT = sys.argv[0]
VERSION = "2.6"

# The Unicode Database
UNIDATA_VERSION = "5.1.0"
# Data file name templates: %s is filled with "" for the current UCD, or
# with "-<version>" for an older snapshot (see maketables below).
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"

# older UCD releases for which delta tables are generated
old_versions = ["3.2.0"]

# General_Category values; the list position becomes the numeric code
# stored in the generated records (index 0, "Cn", doubles as "unassigned").
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

# Bidi_Class values; position becomes the numeric code in the records.
BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

# East_Asian_Width values; position becomes the numeric code.
EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

# Bit flags for the "flags" field of each type record.
# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
NODELTA_MASK = 0x100
NUMERIC_MASK = 0x200
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000065def maketables(trace=0):
Fredrik Lundhf367cac2000-09-24 23:18:31 +000066
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000067 print "--- Reading", UNICODE_DATA % "", "..."
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000068
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000069 version = ""
70 unicode = UnicodeData(UNICODE_DATA % version,
71 COMPOSITION_EXCLUSIONS % version,
Antoine Pitroue988e282009-04-27 21:53:26 +000072 EASTASIAN_WIDTH % version,
Amaury Forgeot d'Arcd0052d12009-10-06 19:56:32 +000073 UNIHAN % version,
Antoine Pitroue988e282009-04-27 21:53:26 +000074 DERIVEDNORMALIZATION_PROPS % version)
Fredrik Lundhf367cac2000-09-24 23:18:31 +000075
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000076 print len(filter(None, unicode.table)), "characters"
77
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000078 for version in old_versions:
79 print "--- Reading", UNICODE_DATA % ("-"+version), "..."
80 old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
81 COMPOSITION_EXCLUSIONS % ("-"+version),
Amaury Forgeot d'Arcd0052d12009-10-06 19:56:32 +000082 EASTASIAN_WIDTH % ("-"+version),
83 UNIHAN % ("-"+version))
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000084 print len(filter(None, old_unicode.table)), "characters"
85 merge_old_version(version, unicode, old_unicode)
86
Fredrik Lundhb2dfd732001-01-21 23:31:52 +000087 makeunicodename(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000088 makeunicodedata(unicode, trace)
Fredrik Lundhb2dfd732001-01-21 23:31:52 +000089 makeunicodetype(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000090
91# --------------------------------------------------------------------
92# unicode character properties
93
def makeunicodedata(unicode, trace):
    """Generate Modules/unicodedata_db.h: unique property records, the
    decomposition data, the NFC composition pair tables, and the delta
    tables for the old UCD versions found in unicode.changed."""

    dummy = (0, 0, 0, 0, 0, 0)  # record 0 is used for unassigned characters
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print "--- Preparing", FILE, "..."

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            normalizationquickcheck = record[16]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables; identical property
            # tuples share one record via the cache
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                # the C NFD/NFKD buffer has a fixed capacity
                if len(decomp) > 19:
                    raise Exception, "character %x has a decomposition too large for nfd_nfkd" % char
                # prefix: a leading "<tag>" marks a compatibility mapping
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content: first word packs prefix index (low byte) and
                # decomposition length (high bits); rest are code points
                decomp = [prefix + (len(decomp)<<8)] +\
                         map(lambda s: int(s, 16), decomp)
                # Collect NFC pairs: canonical (no prefix) two-character
                # decompositions, not excluded, whose first character has
                # combining class 0
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                # NOTE(review): decomp_data is a flat list of ints, so
                # list.index(decomp) (a list argument) always raises
                # ValueError and the sharing branch never fires; harmless,
                # but identical decompositions are never deduplicated --
                # confirm before relying on the "unique entries" count.
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    # renumber the NFC first/last characters compactly and collect the
    # contiguous code point ranges they occupy
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    # dense (first x last) -> composed character matrix; 0 = no pair
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print len(table), "unique properties"
    print len(decomp_prefix), "unique decomposition prefixes"
    print len(decomp_data), "unique decomposition entries:",
    print decomp_size, "bytes"
    print total_first, "first characters in NFC"
    print total_last, "last characters in NFC"
    print len(comp_pairs), "NFC pairs"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, '#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION
    print >>fp, "/* a list of unique database records */"
    print >>fp, \
          "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    print >>fp, "/* Reindexing of NFC first characters. */"
    print >>fp, "#define TOTAL_FIRST",total_first
    print >>fp, "#define TOTAL_LAST",total_last
    print >>fp, "struct reindex{int start;short count,index;};"
    print >>fp, "static struct reindex nfc_first[] = {"
    for start,end in comp_first_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_first[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"
    print >>fp, "static struct reindex nfc_last[] = {"
    for start,end in comp_last_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_last[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print >>fp, "/* string literals */"
    print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
    for name in CATEGORY_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
    for name in BIDIRECTIONAL_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_EastAsianWidthNames[] = {"
    for name in EASTASIANWIDTH_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "static const char *decomp_prefix[] = {"
    for name in decomp_prefix:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    # split record index table into a two-level (index1/index2) lookup
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* index tables for the database records */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print >>fp, "/* decomposition data */"
    Array("decomp_data", decomp_data).dump(fp, trace)

    print >>fp, "/* index tables for the decomposition data */"
    print >>fp, "#define DECOMP_SHIFT", shift
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print >>fp, "/* NFC pairs */"
    print >>fp, "#define COMP_SHIFT", shift
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        # deduplicate change records, keeping an index per code point
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print >>fp, "static const change_record change_records_%s[] = {" % cversion
        for record in records:
            print >>fp, "\t{ %s }," % ", ".join(map(str,record))
        print >>fp, "};"
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        # emit a C accessor that walks the two-level index
        print >>fp, "static const change_record* get_change_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tint index;"
        print >>fp, "\tif (n >= 0x110000) index = 0;"
        print >>fp, "\telse {"
        print >>fp, "\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift)
        print >>fp, "\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1))
        print >>fp, "\t}"
        print >>fp, "\treturn change_records_%s+index;" % cversion
        print >>fp, "}\n"
        # per-version 1:1 normalization overrides as a C switch
        print >>fp, "static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tswitch(n) {"
        for k, v in normalization:
            print >>fp, "\tcase %s: return 0x%s;" % (hex(k), v)
        print >>fp, "\tdefault: return 0;"
        print >>fp, "\t}\n}\n"

    fp.close()
344
345# --------------------------------------------------------------------
346# unicode character type tables
347
def makeunicodetype(unicode, trace):
    """Generate Objects/unicodetype_db.h: the character type records
    (case mappings, digit/decimal/numeric values, flag bits) plus the
    generated C helpers _PyUnicode_ToNumeric, _PyUnicode_IsWhitespace
    and _PyUnicode_IsLinebreak."""

    FILE = "Objects/unicodetype_db.h"

    print "--- Preparing", FILE, "..."

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)  # record 0 is used for unassigned characters
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}      # numeric value string -> list of code points
    spaces = []       # code points with the whitespace property
    linebreaks = []   # code points with the linebreak property

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            flags = 0
            # NOTE(review): "delta" is assigned but never read below --
            # looks like a leftover from the delta-predictor logic;
            # confirm before removing.
            delta = True
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if category == "Zl" or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            # use delta predictor for upper/lower/title if it fits
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                # UCD.html says that a missing title char means that
                # it defaults to the uppercase character, not to the
                # character itself. Apparently, in the current UCD (5.x)
                # this feature is never used
                title = upper
            upper_d = upper - char
            lower_d = lower - char
            title_d = title - char
            if -32768 <= upper_d <= 32767 and \
               -32768 <= lower_d <= 32767 and \
               -32768 <= title_d <= 32767:
                # use deltas, stored as unsigned 16-bit two's complement
                upper = upper_d & 0xffff
                lower = lower_d & 0xffff
                title = title_d & 0xffff
            else:
                # deltas don't fit: store absolute values and flag it
                flags |= NODELTA_MASK
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables; identical tuples share
            # one record via the cache
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print len(table), "unique character type entries"
    print sum(map(len, numeric.values())), "numeric code points"
    print len(spaces), "whitespace code points"
    print len(linebreaks), "linebreak code points"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "/* a list of unique character type descriptors */"
    print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    # split decomposition index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* type indexes */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # Generate code for _PyUnicode_ToNumeric()
    numeric_items = numeric.items()
    numeric_items.sort()
    print >>fp, '/* Returns the numeric value as double for Unicode characters'
    print >>fp, ' * having this property, -1.0 otherwise.'
    print >>fp, ' */'
    print >>fp, 'double _PyUnicode_ToNumeric(Py_UNICODE ch)'
    print >>fp, '{'
    print >>fp, '    switch (ch) {'
    for value, codepoints in numeric_items:
        # Turn text into float literals
        parts = value.split('/')
        parts = [repr(float(part)) for part in parts]
        value = '/'.join(parts)

        # guard cases above the BMP with #ifdef Py_UNICODE_WIDE so the
        # header also compiles on narrow (UCS-2) builds
        haswide = False
        hasnonewide = False
        codepoints.sort()
        for codepoint in codepoints:
            if codepoint < 0x10000:
                hasnonewide = True
            if codepoint >= 0x10000 and not haswide:
                print >>fp, '#ifdef Py_UNICODE_WIDE'
                haswide = True
            print >>fp, '    case 0x%04X:' % (codepoint,)
        if haswide and hasnonewide:
            print >>fp, '#endif'
        print >>fp, '        return (double) %s;' % (value,)
        if haswide and not hasnonewide:
            print >>fp, '#endif'
    print >>fp,'    }'
    print >>fp,'    return -1.0;'
    print >>fp,'}'
    print >>fp

    # Generate code for _PyUnicode_IsWhitespace()
    print >>fp, "/* Returns 1 for Unicode characters having the bidirectional"
    print >>fp, " * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise."
    print >>fp, " */"
    print >>fp, 'int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)'
    print >>fp, '{'
    print >>fp, '#ifdef WANT_WCTYPE_FUNCTIONS'
    print >>fp, '    return iswspace(ch);'
    print >>fp, '#else'
    print >>fp, '    switch (ch) {'
    print >>fp

    haswide = False
    hasnonewide = False
    spaces.sort()
    for codepoint in spaces:
        if codepoint < 0x10000:
            hasnonewide = True
        if codepoint >= 0x10000 and not haswide:
            print >>fp, '#ifdef Py_UNICODE_WIDE'
            haswide = True
        print >>fp, '    case 0x%04X:' % (codepoint,)
    if haswide and hasnonewide:
        print >>fp, '#endif'
    print >>fp, '        return 1;'
    if haswide and not hasnonewide:
        print >>fp, '#endif'

    print >>fp,'    }'
    print >>fp,'    return 0;'
    print >>fp, '#endif'
    print >>fp,'}'
    print >>fp

    # Generate code for _PyUnicode_IsLinebreak()
    print >>fp, "/* Returns 1 for Unicode characters having the category 'Zl',"
    print >>fp, " * 'Zp' or type 'B', 0 otherwise."
    print >>fp, " */"
    print >>fp, 'int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)'
    print >>fp, '{'
    print >>fp, '    switch (ch) {'
    haswide = False
    hasnonewide = False
    linebreaks.sort()
    for codepoint in linebreaks:
        if codepoint < 0x10000:
            hasnonewide = True
        if codepoint >= 0x10000 and not haswide:
            print >>fp, '#ifdef Py_UNICODE_WIDE'
            haswide = True
        print >>fp, '    case 0x%04X:' % (codepoint,)
    if haswide and hasnonewide:
        print >>fp, '#endif'
    print >>fp, '        return 1;'
    if haswide and not hasnonewide:
        print >>fp, '#endif'

    print >>fp,'    }'
    print >>fp,'    return 0;'
    print >>fp,'}'
    print >>fp

    fp.close()
558
559# --------------------------------------------------------------------
560# unicode name database
561
def makeunicodename(unicode, trace):
    """Generate Modules/unicodename_db.h: a compressed code-point->name
    phrasebook (names split into words, words stored in a shared lexicon)
    and a name->code hash table."""

    FILE = "Modules/unicodename_db.h"

    print "--- Preparing", FILE, "..."

    # collect names; skip "<...>" placeholder names (e.g. range markers)
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print len(filter(lambda n: n is not None, names)), "distinct names"

    # collect unique words from names (note that we distinguish between
    # words inside a sentence and words ending a sentence; the latter
    # include the trailing null byte)

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                # words maps word -> list whose length is its frequency
                # (the first element carries the insertion index)
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print n, "words in text;", b, "bytes"

    wordlist = words.items()

    # sort on falling frequency, then by name
    def cmpwords((aword, alist),(bword, blist)):
        r = -cmp(len(alist),len(blist))
        if r:
            return r
        return cmp(aword, bword)
    wordlist.sort(cmpwords)

    # figure out how many phrasebook escapes we need: word indexes above
    # "short" are stored as two bytes, introduced by an escape byte
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print escapes, "escapes"

    short = 256 - escapes

    assert short > 0

    print short, "short indexes in lexicon"

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print n, "short indexes in phrasebook"

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(lambda a, b: len(b[0])-len(a[0]))
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = map(ord, lexicon)

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes: escape byte (>= short) + low byte
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    # every phrasebook entry must fit in one byte
    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set.  if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "#define NAME_MAXLEN", 256
    print >>fp
    print >>fp, "/* lexicon */"
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split the phrasebook offset table into a two-level lookup
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print >>fp, "/* code->name phrasebook */"
    print >>fp, "#define phrasebook_shift", shift
    print >>fp, "#define phrasebook_short", short

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print >>fp, "/* name->code dictionary */"
    codehash.dump(fp, trace)

    fp.close()
722
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000723
def merge_old_version(version, new, old):
    """Record on *new* the per-character delta needed to reconstruct the
    *old* UnicodeData snapshot.

    version -- version string of the old database
    new, old -- UnicodeData instances
    Appends (version, zipped change records, normalization changes) to
    new.changed.  Raises NotImplementedError if the composition
    exclusion sets differ, and an ad-hoc Difference exception for any
    field change this function does not know how to encode.
    """
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError, "exclusions differ"

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            # k indexes the semicolon-separated record fields parsed in
            # UnicodeData.__init__
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 2:
                        # general category
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        # bidirectional class
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        # decomposition mapping
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        # decimal digit value
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # numeric value
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value is better not 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        # mirrored property ('Y' or something else in the data)
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    else:
                        # any other field change is unexpected; fail loudly
                        class Difference(Exception):pass
                        raise Difference, (hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, zip(bidir_changes, category_changes,
                                     decimal_changes, mirrored_changes,
                                     numeric_changes),
                        normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000803
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000804
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000805# --------------------------------------------------------------------
806# the following support code is taken from the unidb utilities
807# Copyright (c) 1999-2000 by Secret Labs AB
808
809# load a unicode-data file from disk
810
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000811class UnicodeData:
812
Amaury Forgeot d'Arcd0052d12009-10-06 19:56:32 +0000813 def __init__(self, filename, exclusions, eastasianwidth, unihan,
Antoine Pitroue988e282009-04-27 21:53:26 +0000814 derivednormalizationprops=None, expand=1):
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000815 self.changed = []
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000816 file = open(filename)
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000817 table = [None] * 0x110000
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000818 while 1:
819 s = file.readline()
820 if not s:
821 break
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000822 s = s.strip().split(";")
823 char = int(s[0], 16)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000824 table[char] = s
825
Martin v. Löwis97225da2002-11-24 23:05:09 +0000826 # expand first-last ranges
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000827 if expand:
828 field = None
Martin v. Löwis97225da2002-11-24 23:05:09 +0000829 for i in range(0, 0x110000):
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000830 s = table[i]
831 if s:
832 if s[1][-6:] == "First>":
833 s[1] = ""
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000834 field = s
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000835 elif s[1][-5:] == "Last>":
836 s[1] = ""
837 field = None
838 elif field:
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000839 f2 = field[:]
840 f2[0] = "%X" % i
841 table[i] = f2
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000842
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000843 # public attributes
844 self.filename = filename
845 self.table = table
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000846 self.chars = range(0x110000) # unicode 3.2
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000847
Martin v. Löwis677bde22002-11-23 22:08:15 +0000848 file = open(exclusions)
849 self.exclusions = {}
850 for s in file:
851 s = s.strip()
852 if not s:
853 continue
854 if s[0] == '#':
855 continue
856 char = int(s.split()[0],16)
857 self.exclusions[char] = 1
858
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +0000859 widths = [None] * 0x110000
860 for s in open(eastasianwidth):
861 s = s.strip()
862 if not s:
863 continue
864 if s[0] == '#':
865 continue
866 s = s.split()[0].split(';')
867 if '..' in s[0]:
868 first, last = [int(c, 16) for c in s[0].split('..')]
869 chars = range(first, last+1)
870 else:
871 chars = [int(s[0], 16)]
872 for char in chars:
873 widths[char] = s[1]
874 for i in range(0, 0x110000):
875 if table[i] is not None:
876 table[i].append(widths[i])
Antoine Pitroue988e282009-04-27 21:53:26 +0000877 if derivednormalizationprops:
878 quickchecks = [0] * 0x110000 # default is Yes
879 qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
880 for s in open(derivednormalizationprops):
881 if '#' in s:
882 s = s[:s.index('#')]
883 s = [i.strip() for i in s.split(';')]
884 if len(s) < 2 or s[1] not in qc_order:
885 continue
886 quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
887 quickcheck_shift = qc_order.index(s[1])*2
888 quickcheck <<= quickcheck_shift
889 if '..' not in s[0]:
890 first = last = int(s[0], 16)
891 else:
892 first, last = [int(c, 16) for c in s[0].split('..')]
893 for char in range(first, last+1):
894 assert not (quickchecks[char]>>quickcheck_shift)&3
895 quickchecks[char] |= quickcheck
896 for i in range(0, 0x110000):
897 if table[i] is not None:
898 table[i].append(quickchecks[i])
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +0000899
Amaury Forgeot d'Arcd0052d12009-10-06 19:56:32 +0000900 for line in open(unihan):
901 if not line.startswith('U+'):
902 continue
903 code, tag, value = line.split(None, 3)[:3]
904 if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
905 'kOtherNumeric'):
906 continue
907 value = value.strip().replace(',', '')
908 i = int(code[2:], 16)
909 # Patch the numeric field
910 if table[i] is not None:
911 table[i][8] = value
912
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000913 def uselatin1(self):
914 # restrict character range to ISO Latin 1
915 self.chars = range(256)
916
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000917# hash table tools
918
919# this is a straight-forward reimplementation of Python's built-in
920# dictionary type, using a static data structure, and a custom string
921# hash algorithm.
922
def myhash(s, magic):
    # Case-insensitive string hash used for the static name lookup table.
    # Multiplies the accumulator by `magic` per character and, whenever it
    # grows past 24 bits, folds the overflowing byte back into the low end.
    acc = 0
    for ch in s.upper():
        acc = acc * magic + ord(ch)
        overflow = acc & 0xff000000
        if overflow:
            acc = (acc ^ ((overflow >> 24) & 0xff)) & 0x00ffffff
    return acc
931
# candidate hash table sizes, as (size, poly) pairs: a power-of-two slot
# count and an accompanying polynomial for the probe sequence.
# Hash.__init__ picks the first size larger than the data set.
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
938
class Hash:
    # Static hash table modelled on Python's built-in dict: open addressing
    # with a polynomial-driven probe sequence.  The table is dumped as a C
    # array plus the #defines (magic, size, poly) the C lookup code needs
    # to reproduce the same probing.

    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure
        # name: prefix for the generated C identifiers
        # data: list of (string key, value) pairs
        # magic: multiplier passed to myhash (must match the C side)

        # determine table size: first power of two with room for the data,
        # together with its polynomial (see SIZES)
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError, "ran out of polynominals"

        print size, "slots in hash table"

        table = [None] * size

        mask = size-1

        n = 0 # collision counter

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            # slot taken: derive a probe increment from the hash and
            # rehash until a free slot is found
            incr = (h ^ (h >> 3)) & mask;
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print n, "collisions"
        self.collisions = n

        # empty slots are stored as 0 in the generated array
        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array, followed by the #defines the
        # C lookup code needs
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
1002
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001003# stuff to deal with arrays of unsigned integers
1004
class Array:
    # Named sequence of unsigned integers, dumpable as a static C array.

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # Emit the data as a C array declaration, choosing the narrowest
        # unsigned element type and wrapping lines near 78 columns.
        # With trace set, report the table's byte size on stderr.
        width = getsize(self.data)
        if trace:
            print >>sys.stderr, self.name+":", width*len(self.data), "bytes"
        ctype = {1: "unsigned char", 2: "unsigned short"}.get(width, "unsigned int")
        file.write("static " + ctype)
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            line = "    "
            for item in self.data:
                piece = str(item) + ", "
                if len(line) + len(piece) > 78:
                    file.write(line + "\n")
                    line = "    " + piece
                else:
                    line = line + piece
            if line.strip():
                file.write(line + "\n")
        file.write("};\n\n")
1036
def getsize(data):
    """Return the smallest unsigned C integer width, in bytes (1, 2,
    or 4), able to hold every value of the given array."""
    largest = max(data)
    for width, limit in ((1, 256), (2, 65536)):
        if largest < limit:
            return width
    return 4
1046
Tim Peters21013482000-09-25 07:13:41 +00001047def splitbins(t, trace=0):
1048 """t, trace=0 -> (t1, t2, shift). Split a table to save space.
1049
1050 t is a sequence of ints. This function can be useful to save space if
1051 many of the ints are the same. t1 and t2 are lists of ints, and shift
1052 is an int, chosen to minimize the combined size of t1 and t2 (in C
1053 code), and where for each i in range(len(t)),
1054 t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1055 where mask is a bitmask isolating the last "shift" bits.
1056
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001057 If optional arg trace is non-zero (default zero), progress info
1058 is printed to sys.stderr. The higher the value, the more info
1059 you'll get.
Tim Peters21013482000-09-25 07:13:41 +00001060 """
1061
Tim Peters21013482000-09-25 07:13:41 +00001062 if trace:
1063 def dump(t1, t2, shift, bytes):
1064 print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
1065 len(t1), len(t2), shift, bytes)
1066 print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
1067 "bytes"
1068 n = len(t)-1 # last valid index
1069 maxshift = 0 # the most we can shift n and still have something left
1070 if n > 0:
1071 while n >> 1:
1072 n >>= 1
1073 maxshift += 1
1074 del n
1075 bytes = sys.maxint # smallest total size so far
1076 t = tuple(t) # so slices can be dict keys
1077 for shift in range(maxshift + 1):
1078 t1 = []
1079 t2 = []
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001080 size = 2**shift
1081 bincache = {}
Tim Peters21013482000-09-25 07:13:41 +00001082 for i in range(0, len(t), size):
1083 bin = t[i:i+size]
1084 index = bincache.get(bin)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001085 if index is None:
Tim Peters21013482000-09-25 07:13:41 +00001086 index = len(t2)
1087 bincache[bin] = index
1088 t2.extend(bin)
1089 t1.append(index >> shift)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001090 # determine memory size
Tim Peters21013482000-09-25 07:13:41 +00001091 b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001092 if trace > 1:
Tim Peters21013482000-09-25 07:13:41 +00001093 dump(t1, t2, shift, b)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001094 if b < bytes:
Tim Peters21013482000-09-25 07:13:41 +00001095 best = t1, t2, shift
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001096 bytes = b
Tim Peters21013482000-09-25 07:13:41 +00001097 t1, t2, shift = best
1098 if trace:
1099 print >>sys.stderr, "Best:",
1100 dump(t1, t2, shift, bytes)
1101 if __debug__:
1102 # exhaustively verify that the decomposition is correct
1103 mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
1104 for i in xrange(len(t)):
1105 assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1106 return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001107
if __name__ == "__main__":
    # regenerate all database files with tracing enabled
    # (maketables is defined earlier in this file)
    maketables(1)