blob: a97d4caf58f0262d0d7f70e99ffdd8f350840c76 [file] [log] [blame]
#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#
26
import sys

SCRIPT = sys.argv[0]
VERSION = "2.6"

# The Unicode Database
UNIDATA_VERSION = "5.2.0"
# Input file name templates.  The %s slot is filled with "" for the
# current version, or "-<version>" for one of old_versions (see
# maketables below).
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"

# Older UCD releases for which change-record delta tables are generated
# (consumed in makeunicodedata's "Generate delta tables" section).
old_versions = ["3.2.0"]

# General-category abbreviations; table entries store *indexes* into this
# list.  NOTE(review): "Cn" appears both at index 0 (used for unassigned
# code points) and at index 17 -- the duplicate looks intentional but is
# worth keeping in mind when searching with .index().
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

# Bidirectional-class abbreviations; index 0 (empty string) means "none".
BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

# East-Asian-width abbreviations (Fullwidth, Halfwidth, Wide, Narrow,
# Ambiguous, Neutral).
EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

# Character-type flag bits stored in _PyUnicode_TypeRecords.flags.
# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
NODELTA_MASK = 0x100     # upper/lower/title fields hold absolute values, not deltas
NUMERIC_MASK = 0x200
Fredrik Lundhe9133f72000-09-25 17:59:57 +000064
def maketables(trace=0):
    """Regenerate the three generated headers from the UCD text files.

    Reads the current Unicode database files (and, for every entry in
    old_versions, the correspondingly suffixed old files), then rewrites
    Modules/unicodename_db.h, Modules/unicodedata_db.h and
    Objects/unicodetype_db.h via the three make* functions below.

    trace -- passed through to the table writers; presumably enables
    extra diagnostics in splitbins/Array (defined elsewhere in this
    script) -- TODO confirm.
    """

    print "--- Reading", UNICODE_DATA % "", "..."

    version = ""
    # UnicodeData (defined elsewhere in this script) parses the UCD text
    # files; the attributes read below are .table, .chars, .exclusions
    # and .changed.
    unicode = UnicodeData(UNICODE_DATA % version,
                          COMPOSITION_EXCLUSIONS % version,
                          EASTASIAN_WIDTH % version,
                          UNIHAN % version,
                          DERIVEDNORMALIZATION_PROPS % version)

    # .table has one entry per code point; unassigned ones are None.
    print len(filter(None, unicode.table)), "characters"

    # Merge change records for each supported old UCD version; old files
    # use a "-<version>" suffix, e.g. UnicodeData-3.2.0.txt.
    for version in old_versions:
        print "--- Reading", UNICODE_DATA % ("-"+version), "..."
        old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
                                  COMPOSITION_EXCLUSIONS % ("-"+version),
                                  EASTASIAN_WIDTH % ("-"+version),
                                  UNIHAN % ("-"+version))
        print len(filter(None, old_unicode.table)), "characters"
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000090
# --------------------------------------------------------------------
# unicode character properties
93
def makeunicodedata(unicode, trace):
    """Write Modules/unicodedata_db.h.

    Emits: the deduplicated database-record table, NFC first/last
    reindex tables, the category/bidi/east-asian-width name string
    arrays, compressed decomposition data, the NFC composition pair
    table, and per-old-version change-record tables.

    unicode -- a UnicodeData instance (see maketables).
    trace   -- forwarded to splitbins/Array diagnostics.
    """

    # One dummy record at index 0 so that index 0 can mean "unassigned".
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print "--- Preparing", FILE, "..."

    # 1) database properties
    # Deduplicate (category, combining, bidi, mirrored, width, quickcheck)
    # tuples; index[] maps each code point to its record number.

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            normalizationquickcheck = record[16]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data
    # decomp_data is one flat int array: for each decomposition, a header
    # word (length<<8 | prefix-index) followed by the decomposed code
    # points; decomp_index[] holds each code point's offset into it.

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                # the generated C code (nfd_nfkd) has a fixed-size buffer
                if len(decomp) > 19:
                    raise Exception, "character %x has a decomposition too large for nfd_nfkd" % char
                # prefix: a leading "<tag>" marks a compatibility mapping
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                # the prefix index must fit in the low byte of the header word
                assert prefix < 256
                # content: header word, then the mapped code points
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs: canonical (no prefix) two-character
                # decompositions that are not excluded and whose first
                # character has combining class 0.
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                # NOTE(review): decomp_data is a flat list of ints, so
                # .index(decomp) (a list) can never match and always raises
                # ValueError -- identical decompositions are therefore never
                # actually shared, only appended.  Harmless for correctness
                # of the output, but the dedup intent is not realized;
                # verify before relying on "unique entries" below.
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    # Renumber NFC first/last characters densely (0..total-1) and record
    # the contiguous code-point ranges they occupy, for the reindex tables.
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                # extend the current contiguous range
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    # Dense 2-D composition table, flattened row-major:
    # comp_data[first*total_last + last] -> composed character (0 = none).
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print len(table), "unique properties"
    print len(decomp_prefix), "unique decomposition prefixes"
    print len(decomp_data), "unique decomposition entries:",
    print decomp_size, "bytes"
    print total_first, "first characters in NFC"
    print total_last, "last characters in NFC"
    print len(comp_pairs), "NFC pairs"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, '#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION
    print >>fp, "/* a list of unique database records */"
    print >>fp, \
          "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    # reindex tables map a code point range onto the dense NFC numbering
    print >>fp, "/* Reindexing of NFC first characters. */"
    print >>fp, "#define TOTAL_FIRST",total_first
    print >>fp, "#define TOTAL_LAST",total_last
    print >>fp, "struct reindex{int start;short count,index;};"
    print >>fp, "static struct reindex nfc_first[] = {"
    for start,end in comp_first_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_first[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"
    print >>fp, "static struct reindex nfc_last[] = {"
    for start,end in comp_last_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_last[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print >>fp, "/* string literals */"
    print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
    for name in CATEGORY_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
    for name in BIDIRECTIONAL_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_EastAsianWidthNames[] = {"
    for name in EASTASIANWIDTH_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "static const char *decomp_prefix[] = {"
    for name in decomp_prefix:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    # split record index table (two-level trie; see splitbins)
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* index tables for the database records */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print >>fp, "/* decomposition data */"
    Array("decomp_data", decomp_data).dump(fp, trace)

    print >>fp, "/* index tables for the decomposition data */"
    print >>fp, "#define DECOMP_SHIFT", shift
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print >>fp, "/* NFC pairs */"
    print >>fp, "#define COMP_SHIFT", shift
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions.  unicode.changed holds, per
    # old version, a per-code-point change-record table and a list of
    # (codepoint, normalization) pairs (filled in by merge_old_version).
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print >>fp, "static const change_record change_records_%s[] = {" % cversion
        for record in records:
            print >>fp, "\t{ %s }," % ", ".join(map(str,record))
        print >>fp, "};"
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        # emit a C accessor that walks the two-level index
        print >>fp, "static const change_record* get_change_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tint index;"
        print >>fp, "\tif (n >= 0x110000) index = 0;"
        print >>fp, "\telse {"
        print >>fp, "\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift)
        print >>fp, "\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1))
        print >>fp, "\t}"
        print >>fp, "\treturn change_records_%s+index;" % cversion
        print >>fp, "}\n"
        print >>fp, "static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tswitch(n) {"
        for k, v in normalization:
            print >>fp, "\tcase %s: return 0x%s;" % (hex(k), v)
        print >>fp, "\tdefault: return 0;"
        print >>fp, "\t}\n}\n"

    fp.close()
343
# --------------------------------------------------------------------
# unicode character type tables
346
def makeunicodetype(unicode, trace):
    """Write Objects/unicodetype_db.h.

    Emits the deduplicated character-type record table (upper/lower/title
    mappings, decimal/digit values, flag bits), its two-level index, and
    generated C functions _PyUnicode_ToNumeric, _PyUnicode_IsWhitespace
    and _PyUnicode_IsLinebreak.

    unicode -- a UnicodeData instance (see maketables).
    trace   -- forwarded to splitbins/Array diagnostics.
    """

    FILE = "Objects/unicodetype_db.h"

    print "--- Preparing", FILE, "..."

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}        # numeric-value string -> list of code points
    spaces = []         # code points with the whitespace property
    linebreaks = []     # code points with the linebreak property

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            flags = 0
            delta = True   # NOTE(review): assigned but never read in this function
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if category == "Zl" or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            # use delta predictor for upper/lower/title if it fits
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                # UCD.html says that a missing title char means that
                # it defaults to the uppercase character, not to the
                # character itself. Apparently, in the current UCD (5.x)
                # this feature is never used
                title = upper
            upper_d = upper - char
            lower_d = lower - char
            title_d = title - char
            if -32768 <= upper_d <= 32767 and \
               -32768 <= lower_d <= 32767 and \
               -32768 <= title_d <= 32767:
                # use deltas, stored as unsigned 16-bit two's complement
                upper = upper_d & 0xffff
                lower = lower_d & 0xffff
                title = title_d & 0xffff
            else:
                # deltas don't fit; store absolute values and flag it
                flags |= NODELTA_MASK
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                # record[8] is the numeric value as text (may be "a/b")
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print len(table), "unique character type entries"
    print sum(map(len, numeric.values())), "numeric code points"
    print len(spaces), "whitespace code points"
    print len(linebreaks), "linebreak code points"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "/* a list of unique character type descriptors */"
    print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    # split decomposition index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* type indexes */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # Generate code for _PyUnicode_ToNumeric()
    numeric_items = sorted(numeric.items())
    print >>fp, '/* Returns the numeric value as double for Unicode characters'
    print >>fp, ' * having this property, -1.0 otherwise.'
    print >>fp, ' */'
    print >>fp, 'double _PyUnicode_ToNumeric(Py_UNICODE ch)'
    print >>fp, '{'
    print >>fp, '    switch (ch) {'
    for value, codepoints in numeric_items:
        # Turn text into float literals (fractions like "1/4" become
        # a C division of two float literals)
        parts = value.split('/')
        parts = [repr(float(part)) for part in parts]
        value = '/'.join(parts)

        # wrap cases >= U+10000 in #ifdef Py_UNICODE_WIDE so narrow
        # builds still compile
        haswide = False
        hasnonewide = False
        codepoints.sort()
        for codepoint in codepoints:
            if codepoint < 0x10000:
                hasnonewide = True
            if codepoint >= 0x10000 and not haswide:
                print >>fp, '#ifdef Py_UNICODE_WIDE'
                haswide = True
            print >>fp, '    case 0x%04X:' % (codepoint,)
        if haswide and hasnonewide:
            print >>fp, '#endif'
        print >>fp, '        return (double) %s;' % (value,)
        if haswide and not hasnonewide:
            print >>fp, '#endif'
    print >>fp,'    }'
    print >>fp,'    return -1.0;'
    print >>fp,'}'
    print >>fp

    # Generate code for _PyUnicode_IsWhitespace()
    print >>fp, "/* Returns 1 for Unicode characters having the bidirectional"
    print >>fp, " * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise."
    print >>fp, " */"
    print >>fp, 'int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)'
    print >>fp, '{'
    print >>fp, '#ifdef WANT_WCTYPE_FUNCTIONS'
    print >>fp, '    return iswspace(ch);'
    print >>fp, '#else'
    print >>fp, '    switch (ch) {'

    # same narrow/wide-build #ifdef dance as above
    haswide = False
    hasnonewide = False
    for codepoint in sorted(spaces):
        if codepoint < 0x10000:
            hasnonewide = True
        if codepoint >= 0x10000 and not haswide:
            print >>fp, '#ifdef Py_UNICODE_WIDE'
            haswide = True
        print >>fp, '    case 0x%04X:' % (codepoint,)
    if haswide and hasnonewide:
        print >>fp, '#endif'
    print >>fp, '        return 1;'
    if haswide and not hasnonewide:
        print >>fp, '#endif'

    print >>fp,'    }'
    print >>fp,'    return 0;'
    print >>fp, '#endif'
    print >>fp,'}'
    print >>fp

    # Generate code for _PyUnicode_IsLinebreak()
    print >>fp, "/* Returns 1 for Unicode characters having the category 'Zl',"
    print >>fp, " * 'Zp' or type 'B', 0 otherwise."
    print >>fp, " */"
    print >>fp, 'int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)'
    print >>fp, '{'
    print >>fp, '    switch (ch) {'
    haswide = False
    hasnonewide = False
    for codepoint in sorted(linebreaks):
        if codepoint < 0x10000:
            hasnonewide = True
        if codepoint >= 0x10000 and not haswide:
            print >>fp, '#ifdef Py_UNICODE_WIDE'
            haswide = True
        print >>fp, '    case 0x%04X:' % (codepoint,)
    if haswide and hasnonewide:
        print >>fp, '#endif'
    print >>fp, '        return 1;'
    if haswide and not hasnonewide:
        print >>fp, '#endif'

    print >>fp,'    }'
    print >>fp,'    return 0;'
    print >>fp,'}'
    print >>fp

    fp.close()
554
# --------------------------------------------------------------------
# unicode name database
557
558def makeunicodename(unicode, trace):
559
560 FILE = "Modules/unicodename_db.h"
561
562 print "--- Preparing", FILE, "..."
563
564 # collect names
565 names = [None] * len(unicode.chars)
566
567 for char in unicode.chars:
568 record = unicode.table[char]
569 if record:
570 name = record[1].strip()
571 if name and name[0] != "<":
572 names[char] = name + chr(0)
573
574 print len(filter(lambda n: n is not None, names)), "distinct names"
575
576 # collect unique words from names (note that we differ between
577 # words inside a sentence, and words ending a sentence. the
578 # latter includes the trailing null byte.
579
580 words = {}
581 n = b = 0
582 for char in unicode.chars:
583 name = names[char]
584 if name:
585 w = name.split()
586 b = b + len(name)
587 n = n + len(w)
588 for w in w:
589 l = words.get(w)
590 if l:
591 l.append(None)
592 else:
593 words[w] = [len(words)]
594
595 print n, "words in text;", b, "bytes"
596
597 wordlist = words.items()
598
Martin v. Löwis97225da2002-11-24 23:05:09 +0000599 # sort on falling frequency, then by name
Florent Xiclunadc364722010-03-15 14:00:58 +0000600 def word_key(a):
601 aword, alist = a
602 return -len(alist), aword
603 wordlist.sort(key=word_key)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000604
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000605 # figure out how many phrasebook escapes we need
606 escapes = 0
607 while escapes * 256 < len(wordlist):
608 escapes = escapes + 1
609 print escapes, "escapes"
610
611 short = 256 - escapes
612
613 assert short > 0
614
615 print short, "short indexes in lexicon"
616
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000617 # statistics
618 n = 0
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000619 for i in range(short):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000620 n = n + len(wordlist[i][1])
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000621 print n, "short indexes in phrasebook"
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000622
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000623 # pick the most commonly used words, and sort the rest on falling
624 # length (to maximize overlap)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000625
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000626 wordlist, wordtail = wordlist[:short], wordlist[short:]
Florent Xiclunadc364722010-03-15 14:00:58 +0000627 wordtail.sort(key=lambda a: a[0], reverse=True)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000628 wordlist.extend(wordtail)
629
630 # generate lexicon from words
631
632 lexicon_offset = [0]
633 lexicon = ""
634 words = {}
635
636 # build a lexicon string
637 offset = 0
638 for w, x in wordlist:
639 # encoding: bit 7 indicates last character in word (chr(128)
640 # indicates the last character in an entire string)
641 ww = w[:-1] + chr(ord(w[-1])+128)
642 # reuse string tails, when possible
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000643 o = lexicon.find(ww)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000644 if o < 0:
645 o = offset
646 lexicon = lexicon + ww
647 offset = offset + len(w)
648 words[w] = len(lexicon_offset)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000649 lexicon_offset.append(o)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000650
651 lexicon = map(ord, lexicon)
652
653 # generate phrasebook from names and lexicon
654 phrasebook = [0]
655 phrasebook_offset = [0] * len(unicode.chars)
656 for char in unicode.chars:
657 name = names[char]
658 if name:
659 w = name.split()
660 phrasebook_offset[char] = len(phrasebook)
661 for w in w:
662 i = words[w]
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000663 if i < short:
664 phrasebook.append(i)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000665 else:
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000666 # store as two bytes
667 phrasebook.append((i>>8) + short)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000668 phrasebook.append(i&255)
669
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000670 assert getsize(phrasebook) == 1
671
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000672 #
673 # unicode name hash table
674
675 # extract names
676 data = []
677 for char in unicode.chars:
678 record = unicode.table[char]
679 if record:
680 name = record[1].strip()
681 if name and name[0] != "<":
682 data.append((name, char))
683
684 # the magic number 47 was chosen to minimize the number of
685 # collisions on the current data set. if you like, change it
686 # and see what happens...
687
688 codehash = Hash("code", data, 47)
689
690 print "--- Writing", FILE, "..."
691
692 fp = open(FILE, "w")
693 print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
694 print >>fp
695 print >>fp, "#define NAME_MAXLEN", 256
696 print >>fp
697 print >>fp, "/* lexicon */"
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000698 Array("lexicon", lexicon).dump(fp, trace)
699 Array("lexicon_offset", lexicon_offset).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000700
701 # split decomposition index table
702 offset1, offset2, shift = splitbins(phrasebook_offset, trace)
703
704 print >>fp, "/* code->name phrasebook */"
705 print >>fp, "#define phrasebook_shift", shift
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000706 print >>fp, "#define phrasebook_short", short
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000707
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000708 Array("phrasebook", phrasebook).dump(fp, trace)
709 Array("phrasebook_offset1", offset1).dump(fp, trace)
710 Array("phrasebook_offset2", offset2).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000711
712 print >>fp, "/* name->code dictionary */"
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000713 codehash.dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000714
715 fp.close()
716
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000717
def merge_old_version(version, new, old):
    """Compute per-codepoint deltas between an *old* UnicodeData snapshot
    and the *new* one, and append them to new.changed.

    The appended entry is (version, changes, normalization_changes), where
    changes zips the per-codepoint lists (bidir, category, decimal,
    mirrored, numeric) and normalization_changes is a list of
    (codepoint, old 1:1 decomposition) pairs.
    """
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError, "exclusions differ"

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            # k indexes the semicolon-split fields of the record
            # (built by UnicodeData.__init__, plus appended extras)
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value is better not 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        # NOTE(review): stores the characters '1'/'0' although
                        # the "no change" sentinel is the int 0xFF -- mixed
                        # types; confirm the consumer of new.changed copes.
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    else:
                        # any other field changing is unexpected; fail loudly
                        class Difference(Exception):pass
                        raise Difference, (hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, zip(bidir_changes, category_changes,
                                     decimal_changes, mirrored_changes,
                                     numeric_changes),
                        normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000797
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000798
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000799# --------------------------------------------------------------------
800# the following support code is taken from the unidb utilities
801# Copyright (c) 1999-2000 by Secret Labs AB
802
803# load a unicode-data file from disk
804
class UnicodeData:
    """In-memory representation of the Unicode character database.

    table[codepoint] holds the semicolon-split record (a list of strings)
    for that codepoint, with extra fields appended by __init__, or None if
    the codepoint is unassigned.  Also loads the composition-exclusion
    list, east-asian widths, optional normalization quick-check data and
    Unihan numeric values.
    """

    def __init__(self, filename, exclusions, eastasianwidth, unihan,
                 derivednormalizationprops=None, expand=1):
        # NOTE(review): the file handles opened below are never closed
        # explicitly; harmless in a short-lived build script, but worth
        # confirming if this is ever reused as a library.
        self.changed = []
        file = open(filename)
        table = [None] * 0x110000
        # one record per line: "codepoint;field;field;..."
        while 1:
            s = file.readline()
            if not s:
                break
            s = s.strip().split(";")
            char = int(s[0], 16)
            table[char] = s

        # expand first-last ranges
        # (ranges are marked by "<..., First>" / "<..., Last>" names;
        # every codepoint in between gets a copy of the First record
        # with its own codepoint and an emptied name field)
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                    elif field:
                        f2 = field[:]
                        f2[0] = "%X" % i
                        table[i] = f2

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(0x110000) # unicode 3.2

        # composition exclusions: one codepoint per non-comment line
        file = open(exclusions)
        self.exclusions = {}
        for s in file:
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            char = int(s.split()[0],16)
            self.exclusions[char] = 1

        # east asian width: "codepoint;width" or "first..last;width";
        # the width is appended as a new field on every assigned record
        widths = [None] * 0x110000
        for s in open(eastasianwidth):
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            s = s.split()[0].split(';')
            if '..' in s[0]:
                first, last = [int(c, 16) for c in s[0].split('..')]
                chars = range(first, last+1)
            else:
                chars = [int(s[0], 16)]
            for char in chars:
                widths[char] = s[1]
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])
        if derivednormalizationprops:
            # pack the four quick-check properties into one int per
            # codepoint, two bits each (0 = Yes, 1 = Maybe, 2 = No),
            # appended as another field on every assigned record
            quickchecks = [0] * 0x110000 # default is Yes
            qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
            for s in open(derivednormalizationprops):
                if '#' in s:
                    s = s[:s.index('#')]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in qc_order:
                    continue
                quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
                quickcheck_shift = qc_order.index(s[1])*2
                quickcheck <<= quickcheck_shift
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    # each 2-bit slot may be set at most once
                    assert not (quickchecks[char]>>quickcheck_shift)&3
                    quickchecks[char] |= quickcheck
            for i in range(0, 0x110000):
                if table[i] is not None:
                    table[i].append(quickchecks[i])

        # Unihan numeric properties override field 8 (the numeric field)
        for line in open(unihan):
            if not line.startswith('U+'):
                continue
            code, tag, value = line.split(None, 3)[:3]
            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                           'kOtherNumeric'):
                continue
            value = value.strip().replace(',', '')
            i = int(code[2:], 16)
            # Patch the numeric field
            if table[i] is not None:
                table[i][8] = value

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = range(256)
910
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000911# hash table tools
912
# this is a straightforward reimplementation of Python's built-in
914# dictionary type, using a static data structure, and a custom string
915# hash algorithm.
916
def myhash(s, magic):
    """Case-insensitive string hash used for the static name->code table.

    Multiplies the running hash by *magic* per character and folds any
    bits above bit 23 back into the low 24 bits, so the result always
    fits in 24 bits.  Must match the C implementation that reads the
    generated table.
    """
    h = 0
    for c in map(ord, s.upper()):
        h = (h * magic) + c
        # fold overflow beyond 24 bits back into the low byte
        # (0xff000000 needs no 'L' suffix: Python 2 ints auto-promote,
        # and the plain literal also compiles under Python 3)
        ix = h & 0xff000000
        if ix:
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
    return h
925
# candidate (size, poly) pairs for the static hash table: power-of-two
# slot counts paired with the polynomial constants that Hash uses to
# reduce the probe increment when it overflows the table mask.
# Per the comment in makeunicodename above, the constants were chosen
# empirically to minimize collisions on the current data set.
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
932
class Hash:
    """Static open-addressing hash table, dumpable as C data.

    Places the (key, value) pairs from *data* into a power-of-two table
    using myhash() with the given *magic* multiplier; collisions are
    resolved with a doubling probe increment reduced by the per-size
    polynomial constant from SIZES.
    """

    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size: smallest SIZES entry strictly larger
        # than the data set
        for size, poly in SIZES:
            if size > len(data):
                # equivalent to size | poly (poly < size, size is a
                # power of two): adds the high-order polynomial term
                poly = size + poly
                break
        else:
            raise AssertionError, "ran out of polynominals"

        print size, "slots in hash table"

        table = [None] * size

        mask = size-1

        n = 0   # total collision count (for reporting only)

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            # slot taken: derive a secondary probe increment from h
            incr = (h ^ (h >> 3)) & mask;
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                # LFSR-style step: double the increment, reducing by
                # the polynomial when it overflows the table mask
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print n, "collisions"
        self.collisions = n

        # empty slots are encoded as value 0 in the C table
        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array, followed by the #defines
        # the C lookup code needs to reproduce the probing
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
996
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000997# stuff to deal with arrays of unsigned integers
998
999class Array:
1000
1001 def __init__(self, name, data):
1002 self.name = name
1003 self.data = data
1004
Fredrik Lundh7b7dd102001-01-21 22:41:08 +00001005 def dump(self, file, trace=0):
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001006 # write data to file, as a C array
1007 size = getsize(self.data)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +00001008 if trace:
1009 print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001010 file.write("static ")
1011 if size == 1:
1012 file.write("unsigned char")
1013 elif size == 2:
1014 file.write("unsigned short")
1015 else:
1016 file.write("unsigned int")
1017 file.write(" " + self.name + "[] = {\n")
1018 if self.data:
1019 s = " "
1020 for item in self.data:
1021 i = str(item) + ", "
1022 if len(s) + len(i) > 78:
1023 file.write(s + "\n")
1024 s = " " + i
1025 else:
1026 s = s + i
Walter Dörwaldaaab30e2002-09-11 20:36:02 +00001027 if s.strip():
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001028 file.write(s + "\n")
1029 file.write("};\n\n")
1030
def getsize(data):
    """Return the smallest C integer size (1, 2 or 4 bytes) that can
    hold every value in *data*."""
    peak = max(data)
    if peak < 0x100:
        return 1
    if peak < 0x10000:
        return 2
    return 4
1040
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """

    if trace:
        def dump(t1, t2, shift, bytes):
            print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes)
        print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
                            "bytes"
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxint  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    # try every candidate shift (bin size 2**shift) and keep the
    # cheapest two-level encoding found
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}   # bin contents -> offset of its first copy in t2
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                # first occurrence of this bin: append it to t2
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print >>sys.stderr, "Best:",
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in xrange(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001101
# When run as a script, regenerate the Unicode database headers.
# maketables is defined earlier in the file; the argument presumably
# enables trace output -- confirm against its signature.
if __name__ == "__main__":
    maketables(1)