#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#

import sys

# Path of this script and generator version; both are stamped into the
# headers written by the make* functions below.
SCRIPT = sys.argv[0]
VERSION = "2.6"

# The Unicode Database
UNIDATA_VERSION = "5.1.0"
# Input file name templates; "%s" is filled with "" for the current UCD
# or with "-<version>" for each entry of old_versions (see maketables).
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"

# Old UCD versions for which delta (change-record) tables are generated.
old_versions = ["3.2.0"]

# General-category abbreviations; the list position is the numeric code
# stored in the generated database records.
# NOTE(review): "Cn" appears at index 0 and again at index 17 --
# CATEGORY_NAMES.index("Cn") always yields 0, so slot 17 is unreachable.
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

# Bidirectional-class abbreviations; index 0 ("") is "no value".
BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

# East Asian Width class abbreviations, indexed by stored code.
EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
NODELTA_MASK = 0x100
NUMERIC_MASK = 0x200
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000065def maketables(trace=0):
Fredrik Lundhf367cac2000-09-24 23:18:31 +000066
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000067 print "--- Reading", UNICODE_DATA % "", "..."
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000068
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000069 version = ""
70 unicode = UnicodeData(UNICODE_DATA % version,
71 COMPOSITION_EXCLUSIONS % version,
Antoine Pitroue988e282009-04-27 21:53:26 +000072 EASTASIAN_WIDTH % version,
Amaury Forgeot d'Arcd0052d12009-10-06 19:56:32 +000073 UNIHAN % version,
Antoine Pitroue988e282009-04-27 21:53:26 +000074 DERIVEDNORMALIZATION_PROPS % version)
Fredrik Lundhf367cac2000-09-24 23:18:31 +000075
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000076 print len(filter(None, unicode.table)), "characters"
77
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000078 for version in old_versions:
79 print "--- Reading", UNICODE_DATA % ("-"+version), "..."
80 old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
81 COMPOSITION_EXCLUSIONS % ("-"+version),
Amaury Forgeot d'Arcd0052d12009-10-06 19:56:32 +000082 EASTASIAN_WIDTH % ("-"+version),
83 UNIHAN % ("-"+version))
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000084 print len(filter(None, old_unicode.table)), "characters"
85 merge_old_version(version, unicode, old_unicode)
86
Fredrik Lundhb2dfd732001-01-21 23:31:52 +000087 makeunicodename(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000088 makeunicodedata(unicode, trace)
Fredrik Lundhb2dfd732001-01-21 23:31:52 +000089 makeunicodetype(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000090
# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):
    """Write Modules/unicodedata_db.h: unique property records plus
    index tables, decomposition data, NFC composition pair tables, and
    delta/normalization tables for each old version in unicode.changed.

    unicode is a UnicodeData instance; trace is forwarded to splitbins
    and Array.dump for diagnostics.
    """

    # record 0 is the dummy used for unassigned code points
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}                  # record tuple -> index into table
    index = [0] * len(unicode.chars)    # code point -> record index

    FILE = "Modules/unicodedata_db.h"

    print "--- Preparing", FILE, "..."

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            # record[16] is presumably the quick-check data merged in
            # from DerivedNormalizationProps by UnicodeData -- confirm
            # against the UnicodeData constructor
            normalizationquickcheck = record[16]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables (deduplicated via cache)
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    # flat array of packed decompositions: first word is
    # prefix-index | (length << 8), followed by the code points
    decomp_data = [0]
    decomp_prefix = [""]        # compatibility tags; "" = canonical
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0             # running size in bytes (2 per entry)

    comp_pairs = []                             # (first, last, composed)
    comp_first = [None] * len(unicode.chars)    # chars valid as NFC first
    comp_last = [None] * len(unicode.chars)     # chars valid as NFC last

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                if len(decomp) > 19:
                    # the packed length field in the C tables is small
                    raise Exception, "character %x has a decomposition too large for nfd_nfkd" % char
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content: pack prefix index and length into the first word
                decomp = [prefix + (len(decomp)<<8)] +\
                         map(lambda s: int(s, 16), decomp)
                # Collect NFC pairs: canonical (no prefix) two-character
                # decompositions of non-excluded characters whose first
                # component has combining class 0
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp    # p (packed header word) unused here
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                try:
                    # NOTE(review): decomp_data is a flat list of ints,
                    # so .index(decomp) -- decomp being a list -- can
                    # never match; every decomposition takes the
                    # ValueError path and is appended without sharing.
                    # Harmless (only affects table size); verify before
                    # "fixing", since a change alters the emitted table.
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    # renumber NFC first/last characters densely and collect the
    # contiguous code point ranges they form (for the reindex structs)
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    # dense total_first x total_last matrix: (first, last) -> composition
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print len(table), "unique properties"
    print len(decomp_prefix), "unique decomposition prefixes"
    print len(decomp_data), "unique decomposition entries:",
    print decomp_size, "bytes"
    print total_first, "first characters in NFC"
    print total_last, "last characters in NFC"
    print len(comp_pairs), "NFC pairs"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, '#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION
    print >>fp, "/* a list of unique database records */"
    print >>fp, \
          "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    print >>fp, "/* Reindexing of NFC first characters. */"
    print >>fp, "#define TOTAL_FIRST",total_first
    print >>fp, "#define TOTAL_LAST",total_last
    print >>fp, "struct reindex{int start;short count,index;};"
    print >>fp, "static struct reindex nfc_first[] = {"
    for start,end in comp_first_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_first[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"
    print >>fp, "static struct reindex nfc_last[] = {"
    for start,end in comp_last_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_last[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print >>fp, "/* string literals */"
    print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
    for name in CATEGORY_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
    for name in BIDIRECTIONAL_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_EastAsianWidthNames[] = {"
    for name in EASTASIANWIDTH_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "static const char *decomp_prefix[] = {"
    for name in decomp_prefix:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* index tables for the database records */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print >>fp, "/* decomposition data */"
    Array("decomp_data", decomp_data).dump(fp, trace)

    print >>fp, "/* index tables for the decomposition data */"
    print >>fp, "#define DECOMP_SHIFT", shift
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print >>fp, "/* NFC pairs */"
    print >>fp, "#define COMP_SHIFT", shift
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        # deduplicate change records, building an index per code point
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print >>fp, "static const change_record change_records_%s[] = {" % cversion
        for record in records:
            print >>fp, "\t{ %s }," % ", ".join(map(str,record))
        print >>fp, "};"
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        # C accessor: two-level table lookup, record 0 for out-of-range
        print >>fp, "static const change_record* get_change_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tint index;"
        print >>fp, "\tif (n >= 0x110000) index = 0;"
        print >>fp, "\telse {"
        print >>fp, "\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift)
        print >>fp, "\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1))
        print >>fp, "\t}"
        print >>fp, "\treturn change_records_%s+index;" % cversion
        print >>fp, "}\n"
        # C accessor: old 1:1 normalization mappings as a switch
        print >>fp, "static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tswitch(n) {"
        for k, v in normalization:
            print >>fp, "\tcase %s: return 0x%s;" % (hex(k), v)
        print >>fp, "\tdefault: return 0;"
        print >>fp, "\t}\n}\n"

    fp.close()
344
# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):
    """Write Objects/unicodetype_db.h: unique character type records
    (case mappings, decimal/digit values, flag bits) with index tables,
    plus generated C helpers _PyUnicode_ToNumeric,
    _PyUnicode_IsWhitespace and _PyUnicode_IsLinebreak.
    """

    FILE = "Objects/unicodetype_db.h"

    print "--- Preparing", FILE, "..."

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}                  # record tuple -> index into table
    index = [0] * len(unicode.chars)
    numeric = {}        # numeric-value string (e.g. "1/2") -> [code points]
    spaces = []         # code points flagged as whitespace
    linebreaks = []     # code points flagged as linebreaks

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            flags = 0
            delta = True   # NOTE(review): assigned but never read (dead)
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if category == "Zl" or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            # use delta predictor for upper/lower/title if it fits
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                # UCD.html says that a missing title char means that
                # it defaults to the uppercase character, not to the
                # character itself. Apparently, in the current UCD (5.x)
                # this feature is never used
                title = upper
            upper_d = upper - char
            lower_d = lower - char
            title_d = title - char
            if -32768 <= upper_d <= 32767 and \
               -32768 <= lower_d <= 32767 and \
               -32768 <= title_d <= 32767:
                # use deltas, stored as 16-bit two's complement
                upper = upper_d & 0xffff
                lower = lower_d & 0xffff
                title = title_d & 0xffff
            else:
                # delta does not fit: store absolute values and flag it
                flags |= NODELTA_MASK
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                # keep the numeric value as its source string; it is
                # emitted verbatim as a C expression below
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables (deduplicated via cache)
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print len(table), "unique character type entries"
    print sum(map(len, numeric.values())), "numeric code points"
    print len(spaces), "whitespace code points"
    print len(linebreaks), "linebreak code points"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "/* a list of unique character type descriptors */"
    print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    # split decomposition index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* type indexes */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # Generate code for _PyUnicode_ToNumeric()
    numeric_items = numeric.items()
    numeric_items.sort()
    print >>fp, '/* Returns the numeric value as double for Unicode characters'
    print >>fp, ' * having this property, -1.0 otherwise.'
    print >>fp, ' */'
    print >>fp, 'double _PyUnicode_ToNumeric(Py_UNICODE ch)'
    print >>fp, '{'
    print >>fp, '    switch (ch) {'
    for value, codepoints in numeric_items:
        # cases for code points >= 0x10000 are wrapped in
        # #ifdef Py_UNICODE_WIDE
        haswide = False
        hasnonewide = False
        codepoints.sort()
        for codepoint in codepoints:
            if codepoint < 0x10000:
                hasnonewide = True
            if codepoint >= 0x10000 and not haswide:
                print >>fp, '#ifdef Py_UNICODE_WIDE'
                haswide = True
            print >>fp, '    case 0x%04X:' % (codepoint,)
        if haswide and hasnonewide:
            print >>fp, '#endif'
        print >>fp, '        return (double) %s;' % (value,)
        if haswide and not hasnonewide:
            print >>fp, '#endif'
    print >>fp,'    }'
    print >>fp,'    return -1.0;'
    print >>fp,'}'
    print >>fp

    # Generate code for _PyUnicode_IsWhitespace()
    print >>fp, "/* Returns 1 for Unicode characters having the bidirectional"
    print >>fp, " * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise."
    print >>fp, " */"
    print >>fp, 'int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)'
    print >>fp, '{'
    print >>fp, '#ifdef WANT_WCTYPE_FUNCTIONS'
    print >>fp, '    return iswspace(ch);'
    print >>fp, '#else'
    print >>fp, '    switch (ch) {'

    haswide = False
    hasnonewide = False
    spaces.sort()
    for codepoint in spaces:
        if codepoint < 0x10000:
            hasnonewide = True
        if codepoint >= 0x10000 and not haswide:
            print >>fp, '#ifdef Py_UNICODE_WIDE'
            haswide = True
        print >>fp, '    case 0x%04X:' % (codepoint,)
    if haswide and hasnonewide:
        print >>fp, '#endif'
    print >>fp, '        return 1;'
    if haswide and not hasnonewide:
        print >>fp, '#endif'

    print >>fp,'    }'
    print >>fp,'    return 0;'
    print >>fp, '#endif'
    print >>fp,'}'
    print >>fp

    # Generate code for _PyUnicode_IsLinebreak()
    print >>fp, "/* Returns 1 for Unicode characters having the category 'Zl',"
    print >>fp, " * 'Zp' or type 'B', 0 otherwise."
    print >>fp, " */"
    print >>fp, 'int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)'
    print >>fp, '{'
    print >>fp, '    switch (ch) {'
    haswide = False
    hasnonewide = False
    linebreaks.sort()
    for codepoint in linebreaks:
        if codepoint < 0x10000:
            hasnonewide = True
        if codepoint >= 0x10000 and not haswide:
            print >>fp, '#ifdef Py_UNICODE_WIDE'
            haswide = True
        print >>fp, '    case 0x%04X:' % (codepoint,)
    if haswide and hasnonewide:
        print >>fp, '#endif'
    print >>fp, '        return 1;'
    if haswide and not hasnonewide:
        print >>fp, '#endif'

    print >>fp,'    }'
    print >>fp,'    return 0;'
    print >>fp,'}'
    print >>fp

    fp.close()
553
# --------------------------------------------------------------------
# unicode name database

def makeunicodename(unicode, trace):
    """Write Modules/unicodename_db.h: a compressed code->name
    "phrasebook" (names encoded as sequences of lexicon word indexes)
    and a name->code hash table.
    """

    FILE = "Modules/unicodename_db.h"

    print "--- Preparing", FILE, "..."

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            # skip synthetic "<...>" placeholder names; keep real names
            # NUL-terminated so end-of-name survives the word encoding
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print len(filter(lambda n: n is not None, names)), "distinct names"

    # collect unique words from names (note that we differ between
    # words inside a sentence, and words ending a sentence. the
    # latter includes the trailing null byte.

    words = {}      # word -> list whose length is the occurrence count
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:     # NOTE: rebinds w to each word in turn
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print n, "words in text;", b, "bytes"

    wordlist = words.items()

    # sort on falling frequency, then by name
    def cmpwords((aword, alist),(bword, blist)):
        r = -cmp(len(alist),len(blist))
        if r:
            return r
        return cmp(aword, bword)
    wordlist.sort(cmpwords)

    # figure out how many phrasebook escapes we need: words beyond the
    # one-byte range are addressed via (escape, byte) pairs
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print escapes, "escapes"

    short = 256 - escapes

    assert short > 0

    print short, "short indexes in lexicon"

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print n, "short indexes in phrasebook"

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(lambda a, b: len(b[0])-len(a[0]))
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}      # reused: word -> phrasebook word index

    # build a lexicon string
    offset = 0
    for w, x in wordlist:   # x (occurrence list) unused here
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = map(ord, lexicon)

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    # common word: single byte
                    phrasebook.append(i)
                else:
                    # store as two bytes (escape byte + low byte)
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    # every phrasebook entry must fit in one byte
    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set. if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "#define NAME_MAXLEN", 256
    print >>fp
    print >>fp, "/* lexicon */"
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split decomposition index table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print >>fp, "/* code->name phrasebook */"
    print >>fp, "#define phrasebook_shift", shift
    print >>fp, "#define phrasebook_short", short

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print >>fp, "/* name->code dictionary */"
    codehash.dump(fp, trace)

    fp.close()
717
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000718
def merge_old_version(version, new, old):
    """Record the delta between an older UCD snapshot and the current one.

    Compares old.table against new.table character by character and appends
    a change record (version, zipped change table, normalization_changes)
    to new.changed.  Only the fields needed for the version delta are
    tracked; fields we deliberately ignore (ISO comment, simple case
    mappings) are skipped, and any other difference raises an exception.
    """
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 2:
                        # general category changed
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        # bidirectional class changed
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        # decomposition mapping changed
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        # decimal digit value changed
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # numeric value changed
                        # Since 0 encodes "no change", the old value is better not 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        # mirrored property changed; store ints rather
                        # than '1'/'0' strings so the array stays
                        # homogeneous with its 0xFF default -- str() of
                        # the value emits the same C source either way
                        if value == 'Y':
                            mirrored_changes[i] = 1
                        else:
                            mirrored_changes[i] = 0
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, zip(bidir_changes, category_changes,
                                     decimal_changes, mirrored_changes,
                                     numeric_changes),
                        normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000798
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000799
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000800# --------------------------------------------------------------------
801# the following support code is taken from the unidb utilities
802# Copyright (c) 1999-2000 by Secret Labs AB
803
804# load a unicode-data file from disk
805
class UnicodeData:
    """In-memory representation of the Unicode Character Database.

    Parses UnicodeData.txt plus several auxiliary data files and exposes:
      filename   -- path of the main data file
      table      -- list indexed by code point; each entry is the list of
                    semicolon-separated fields for that character (with
                    extra fields appended by the later parsing stages
                    below), or None for unassigned code points
      chars      -- the range of code points to process
      exclusions -- dict keyed by code points listed in the composition
                    exclusion file
      changed    -- change records, filled in later by merge_old_version()
    """

    def __init__(self, filename, exclusions, eastasianwidth, unihan,
                 derivednormalizationprops=None, expand=1):
        self.changed = []
        file = open(filename)
        table = [None] * 0x110000
        # UnicodeData.txt: one semicolon-separated record per assigned
        # character, keyed by its hex code point in field 0
        while 1:
            s = file.readline()
            if not s:
                break
            s = s.strip().split(";")
            char = int(s[0], 16)
            table[char] = s

        # expand first-last ranges
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        # start of a <..., First>/<..., Last> range pair:
                        # blank the name and remember the record
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                    elif field:
                        # code point inside an open range: clone the
                        # "First" record, patching in this code point
                        f2 = field[:]
                        f2[0] = "%X" % i
                        table[i] = f2

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(0x110000) # unicode 3.2

        # composition exclusions: one hex code point per non-comment line
        file = open(exclusions)
        self.exclusions = {}
        for s in file:
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            char = int(s.split()[0],16)
            self.exclusions[char] = 1

        # east asian width: lines are CODE;WIDTH or FIRST..LAST;WIDTH;
        # the width code is appended to each assigned table record
        widths = [None] * 0x110000
        for s in open(eastasianwidth):
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            s = s.split()[0].split(';')
            if '..' in s[0]:
                first, last = [int(c, 16) for c in s[0].split('..')]
                chars = range(first, last+1)
            else:
                chars = [int(s[0], 16)]
            for char in chars:
                widths[char] = s[1]
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

        # normalization quick-check flags, packed two bits per NF*_QC
        # property into a single int appended to each assigned record
        # (0 = Yes; 1 = Maybe, 2 = No per 'MN'.index(...) + 1)
        if derivednormalizationprops:
            quickchecks = [0] * 0x110000 # default is Yes
            qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
            for s in open(derivednormalizationprops):
                if '#' in s:
                    s = s[:s.index('#')]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in qc_order:
                    continue
                quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
                quickcheck_shift = qc_order.index(s[1])*2
                quickcheck <<= quickcheck_shift
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    # each property may be listed at most once per char
                    assert not (quickchecks[char]>>quickcheck_shift)&3
                    quickchecks[char] |= quickcheck
            for i in range(0, 0x110000):
                if table[i] is not None:
                    table[i].append(quickchecks[i])

        # Unihan numeric properties: lines look like "U+XXXX  kTag  value";
        # only the three numeric tags are applied
        for line in open(unihan):
            if not line.startswith('U+'):
                continue
            code, tag, value = line.split(None, 3)[:3]
            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                           'kOtherNumeric'):
                continue
            value = value.strip().replace(',', '')
            i = int(code[2:], 16)
            # Patch the numeric field
            if table[i] is not None:
                table[i][8] = value

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = range(256)
911
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000912# hash table tools
913
914# this is a straight-forward reimplementation of Python's built-in
915# dictionary type, using a static data structure, and a custom string
916# hash algorithm.
917
def myhash(s, magic):
    # Case-insensitive string hash for the static name-lookup table:
    # multiply-and-add over the uppercased characters, folding any bits
    # above the low 24 back into the hash whenever they appear.
    h = 0
    for ch in s.upper():
        h = h * magic + ord(ch)
        overflow = h & 0xff000000
        if overflow:
            h = (h ^ ((overflow >> 24) & 0xff)) & 0x00ffffff
    return h
926
# Candidate hash-table sizes (powers of two) paired with an offset; the
# Hash constructor below picks the first size larger than the data set
# and uses size + offset as the polynomial for collision probing.
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
933
class Hash:
    """Build a static open-addressing hash table from (key, value) pairs.

    The table is dumped as C source via dump(); the runtime C lookup code
    must reproduce the same probe sequence from the recorded magic, size
    and poly constants, so the probing logic here must not change.
    """

    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size: smallest power of two larger than the
        # data set; its paired offset (plus the size) becomes the
        # probe polynomial -- see SIZES above
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError, "ran out of polynominals"

        print size, "slots in hash table"

        table = [None] * size

        mask = size-1

        n = 0  # total collision count, for reporting

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            # slot occupied: derive a secondary probe increment from
            # the hash; it must be non-zero, so fall back to mask
            incr = (h ^ (h >> 3)) & mask;
            if not incr:
                incr = mask
            # probe until a free slot turns up, mutating the increment
            # with the polynomial whenever it overflows the table size
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print n, "collisions"
        self.collisions = n

        # unused slots are encoded as 0 in the generated C table
        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array, followed by the constants
        # the C lookup code needs to replay the same hashing
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
997
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000998# stuff to deal with arrays of unsigned integers
999
class Array:
    """A named sequence of unsigned integers that can be written out as
    a static C array, using the smallest element type that fits."""

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # emit this array as C source, wrapping rows at ~78 columns
        size = getsize(self.data)
        if trace:
            sys.stderr.write("%s: %s bytes\n" % (self.name, size * len(self.data)))
        if size == 1:
            ctype = "unsigned char"
        elif size == 2:
            ctype = "unsigned short"
        else:
            ctype = "unsigned int"
        file.write("static %s %s[] = {\n" % (ctype, self.name))
        if self.data:
            line = "    "
            for item in self.data:
                piece = str(item) + ", "
                if len(line) + len(piece) > 78:
                    file.write(line + "\n")
                    line = "    " + piece
                else:
                    line = line + piece
            if line.strip():
                file.write(line + "\n")
        file.write("};\n\n")
1031
def getsize(data):
    """Return the smallest C integer width, in bytes (1, 2 or 4), that
    can represent every value in the given array."""
    biggest = max(data)
    if biggest >= 65536:
        return 4
    if biggest >= 256:
        return 2
    return 1
1041
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """

    if trace:
        def dump(t1, t2, shift, bytes):
            print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes)
        print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
                            "bytes"

    # largest shift worth trying: floor(log2(last valid index))
    hi = len(t) - 1
    maxshift = 0
    while hi > 1:
        hi >>= 1
        maxshift += 1

    cheapest = sys.maxint          # smallest combined size found so far
    t = tuple(t)                   # so slices can be dict keys
    for shift in range(maxshift + 1):
        binsize = 1 << shift
        index = []                 # becomes t1
        flattened = []             # becomes t2
        bincache = {}
        for start in range(0, len(t), binsize):
            bin = t[start:start+binsize]
            pos = bincache.get(bin)
            if pos is None:
                # first time this bin is seen: append it to t2
                pos = len(flattened)
                bincache[bin] = pos
                flattened.extend(bin)
            index.append(pos >> shift)
        # combined memory cost of both tables at this shift
        cost = len(index)*getsize(index) + len(flattened)*getsize(flattened)
        if trace > 1:
            dump(index, flattened, shift, cost)
        if cost < cheapest:
            best = index, flattened, shift
            cheapest = cost
    t1, t2, shift = best
    if trace:
        print >>sys.stderr, "Best:",
        dump(t1, t2, shift, cheapest)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = (1 << shift) - 1    # low-bit mask of shift bits
        for i in xrange(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001102
if __name__ == "__main__":
    # Script entry point.  maketables is defined earlier in the file;
    # the argument is presumably the trace flag threaded through to
    # splitbins/dump above -- TODO confirm against its definition.
    maketables(1)