#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl    created (based on bits and pieces from unidb)
# 2000-09-25 fl    merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl    added character type table
# 2000-09-26 fl    added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl    expand first/last ranges
# 2001-01-19 fl    added character name tables (2.1)
# 2001-01-21 fl    added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd    use string methods
# 2002-10-18 mvl   update to Unicode 3.2
# 2002-10-22 mvl   generate NFC tables
# 2002-11-24 mvl   expand all ranges, sort names version-independently
# 2002-11-25 mvl   add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl   update to Unicode 4.1; add UCD 3.2 delta
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#
import sys

# Name of this script and generator version; both are embedded in the
# "this file was generated by ..." header of every emitted C file.
SCRIPT = sys.argv[0]
VERSION = "2.6"

# The Unicode Database version the tables are generated from.
UNIDATA_VERSION = "5.1.0"
# Input file name templates.  %s is filled with "" for the current
# version, or "-<version>" for an old one (e.g. "UnicodeData-3.2.0.txt").
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"

# Older Unicode versions for which change/delta tables are generated.
old_versions = ["3.2.0"]

# General-category codes; a category's position in this list is the
# integer stored in the generated tables, so the order must match the
# C side.  Index 0 ("Cn") doubles as "unassigned".
# NOTE(review): "Cn" appears twice (indexes 0 and 17); list.index("Cn")
# always yields 0, so slot 17 looks unreachable -- confirm intent.
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

# Bidirectional-class codes; position in this list is the stored integer.
BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

# East Asian Width codes; position in this list is the stored integer.
EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

# Character-type flag bits used by makeunicodetype().
# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
NODELTA_MASK = 0x100
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000063def maketables(trace=0):
Fredrik Lundhf367cac2000-09-24 23:18:31 +000064
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000065 print "--- Reading", UNICODE_DATA % "", "..."
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000066
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000067 version = ""
68 unicode = UnicodeData(UNICODE_DATA % version,
69 COMPOSITION_EXCLUSIONS % version,
Antoine Pitroue988e282009-04-27 21:53:26 +000070 EASTASIAN_WIDTH % version,
71 DERIVEDNORMALIZATION_PROPS % version)
Fredrik Lundhf367cac2000-09-24 23:18:31 +000072
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000073 print len(filter(None, unicode.table)), "characters"
74
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000075 for version in old_versions:
76 print "--- Reading", UNICODE_DATA % ("-"+version), "..."
77 old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
78 COMPOSITION_EXCLUSIONS % ("-"+version),
79 EASTASIAN_WIDTH % ("-"+version))
80 print len(filter(None, old_unicode.table)), "characters"
81 merge_old_version(version, unicode, old_unicode)
82
Fredrik Lundhb2dfd732001-01-21 23:31:52 +000083 makeunicodename(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000084 makeunicodedata(unicode, trace)
Fredrik Lundhb2dfd732001-01-21 23:31:52 +000085 makeunicodetype(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000086
87# --------------------------------------------------------------------
88# unicode character properties
89
def makeunicodedata(unicode, trace):
    """Write Modules/unicodedata_db.h.

    Emits: the deduplicated per-character property records and their
    two-level index, the compressed decomposition data, the NFC
    composition-pair tables, and (for each entry in unicode.changed)
    delta tables describing older Unicode versions.
    """

    # Record 0 is a dummy shared by all unassigned characters.
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print "--- Preparing", FILE, "..."

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties (stored as small ints; the
            # *_NAMES lists define the int <-> name mapping)
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            normalizationquickcheck = record[16]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables, sharing identical records
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]       # flat int stream: [len<<8|prefix, codepoints...]
    decomp_prefix = [""]    # compatibility tags ("<compat>" etc.); "" = canonical
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                # the C nfd_nfkd buffer cannot hold longer decompositions
                if len(decomp) > 19:
                    raise Exception, "character %x has a decomposition too large for nfd_nfkd" % char
                # prefix: a leading "<tag>" marks a compatibility mapping
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content: first word packs length (high byte) and prefix
                # index (low byte), followed by the decomposed codepoints
                decomp = [prefix + (len(decomp)<<8)] +\
                         map(lambda s: int(s, 16), decomp)
                # Collect NFC pairs: canonical two-character decompositions
                # whose first character is a starter and which are not in
                # the composition-exclusion list
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                # NOTE(review): decomp_data is a flat list of ints while
                # `decomp` is a list, so .index() never matches and the
                # except branch always appends -- duplicate decompositions
                # are apparently never shared; confirm whether that is the
                # intended behavior.
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    # Renumber NFC first/last characters densely and collect the
    # contiguous codepoint ranges they occupy.
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    # Dense 2-D composition matrix: (first index, last index) -> composed
    # character (0 where no pair exists).
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print len(table), "unique properties"
    print len(decomp_prefix), "unique decomposition prefixes"
    print len(decomp_data), "unique decomposition entries:",
    print decomp_size, "bytes"
    print total_first, "first characters in NFC"
    print total_last, "last characters in NFC"
    print len(comp_pairs), "NFC pairs"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, '#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION
    print >>fp, "/* a list of unique database records */"
    print >>fp, \
          "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    print >>fp, "/* Reindexing of NFC first characters. */"
    print >>fp, "#define TOTAL_FIRST",total_first
    print >>fp, "#define TOTAL_LAST",total_last
    print >>fp, "struct reindex{int start;short count,index;};"
    print >>fp, "static struct reindex nfc_first[] = {"
    for start,end in comp_first_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_first[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"
    print >>fp, "static struct reindex nfc_last[] = {"
    for start,end in comp_last_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_last[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print >>fp, "/* string literals */"
    print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
    for name in CATEGORY_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
    for name in BIDIRECTIONAL_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_EastAsianWidthNames[] = {"
    for name in EASTASIANWIDTH_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "static const char *decomp_prefix[] = {"
    for name in decomp_prefix:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    # split record index table (splitbins is defined later in this file)
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* index tables for the database records */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print >>fp, "/* decomposition data */"
    Array("decomp_data", decomp_data).dump(fp, trace)

    print >>fp, "/* index tables for the decomposition data */"
    print >>fp, "#define DECOMP_SHIFT", shift
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print >>fp, "/* NFC pairs */"
    print >>fp, "#define COMP_SHIFT", shift
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions (filled in by
    # merge_old_version); records are deduplicated the same way as above.
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print >>fp, "static const change_record change_records_%s[] = {" % cversion
        for record in records:
            print >>fp, "\t{ %s }," % ", ".join(map(str,record))
        print >>fp, "};"
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        # Emit a C lookup function for the change records ...
        print >>fp, "static const change_record* get_change_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tint index;"
        print >>fp, "\tif (n >= 0x110000) index = 0;"
        print >>fp, "\telse {"
        print >>fp, "\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift)
        print >>fp, "\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1))
        print >>fp, "\t}"
        print >>fp, "\treturn change_records_%s+index;" % cversion
        print >>fp, "}\n"
        # ... and a switch mapping codepoints to their old 1:1 mappings.
        print >>fp, "static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tswitch(n) {"
        for k, v in normalization:
            print >>fp, "\tcase %s: return 0x%s;" % (hex(k), v)
        print >>fp, "\tdefault: return 0;"
        print >>fp, "\t}\n}\n"

    fp.close()
340
341# --------------------------------------------------------------------
342# unicode character type tables
343
def makeunicodetype(unicode, trace):
    """Write Objects/unicodetype_db.h.

    Emits deduplicated character-type records (upper/lower/title
    mappings, decimal/digit values, and flag bits from the *_MASK
    constants) plus the two-level index used to look them up.
    """

    FILE = "Objects/unicodetype_db.h"

    print "--- Preparing", FILE, "..."

    # extract unicode types; record 0 is a dummy shared by all
    # unassigned characters
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            flags = 0
            # NOTE(review): `delta` is assigned but never read; the
            # delta/no-delta decision below uses NODELTA_MASK instead.
            delta = True
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if category == "Zl" or bidirectional == "B":
                flags |= LINEBREAK_MASK
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            # use delta predictor for upper/lower/title if it fits
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                # UCD.html says that a missing title char means that
                # it defaults to the uppercase character, not to the
                # character itself. Apparently, in the current UCD (5.x)
                # this feature is never used
                title = upper
            upper_d = upper - char
            lower_d = lower - char
            title_d = title - char
            if -32768 <= upper_d <= 32767 and \
               -32768 <= lower_d <= 32767 and \
               -32768 <= title_d <= 32767:
                # use deltas, stored as 16-bit two's complement
                upper = upper_d & 0xffff
                lower = lower_d & 0xffff
                title = title_d & 0xffff
            else:
                # deltas don't fit: store absolute codepoints and flag it
                flags |= NODELTA_MASK
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables, sharing identical records
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print len(table), "unique character type entries"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "/* a list of unique character type descriptors */"
    print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    # split the record index table (splitbins is defined later in file)
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* type indexes */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    fp.close()
447
448# --------------------------------------------------------------------
449# unicode name database
450
def makeunicodename(unicode, trace):
    """Write Modules/unicodename_db.h.

    Builds the compressed code->name "phrasebook" (names split into
    words, words stored once in a shared lexicon, frequent words
    addressed by a single byte) and the name->code hash table.
    """

    FILE = "Modules/unicodename_db.h"

    print "--- Preparing", FILE, "..."

    # collect names (skip "<...First/Last>" range markers); each stored
    # name is NUL-terminated so end-of-name survives the word encoding
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print len(filter(lambda n: n is not None, names)), "distinct names"

    # collect unique words from names (note that we differ between
    # words inside a sentence, and words ending a sentence. the
    # latter includes the trailing null byte.

    words = {}  # word -> list whose length counts occurrences
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print n, "words in text;", b, "bytes"

    wordlist = words.items()

    # sort on falling frequency, then by name
    def cmpwords((aword, alist),(bword, blist)):
        r = -cmp(len(alist),len(blist))
        if r:
            return r
        return cmp(aword, bword)
    wordlist.sort(cmpwords)

    # figure out how many phrasebook escapes we need: escape bytes
    # introduce two-byte indexes for the less common words
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print escapes, "escapes"

    # words with index < short fit in one phrasebook byte
    short = 256 - escapes

    assert short > 0

    print short, "short indexes in lexicon"

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print n, "short indexes in phrasebook"

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(lambda a, b: len(b[0])-len(a[0]))
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}  # reused: word -> index into lexicon_offset

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = map(ord, lexicon)

    # generate phrasebook from names and lexicon: one byte per common
    # word, two bytes (escape + low byte) for the rest
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    # every phrasebook entry must fit in one byte (getsize is defined
    # later in this file)
    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set. if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "#define NAME_MAXLEN", 256
    print >>fp
    print >>fp, "/* lexicon */"
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split the phrasebook offset table into a two-level index
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print >>fp, "/* code->name phrasebook */"
    print >>fp, "#define phrasebook_shift", shift
    print >>fp, "#define phrasebook_short", short

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print >>fp, "/* name->code dictionary */"
    codehash.dump(fp, trace)

    fp.close()
611
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000612
def merge_old_version(version, new, old):
    """Record the differences between UnicodeData `old` and `new`.

    Appends (version, change_records, normalization_changes) to
    new.changed, where change_records is a per-codepoint zip of
    (bidir, category, decimal, mirrored, numeric) deltas that
    makeunicodedata() turns into C change_record tables.
    """
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError, "exclusions differ"

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    # NOTE(review): mirrored_changes mixes 0xFF ints with '1'/'0'
    # strings (set below); this works only because the consumer
    # stringifies every field with map(str, record) -- confirm intent.
    mirrored_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ; k indexes the UnicodeData.txt
        # semicolon-separated fields
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value is better not 0
                        assert value != "0" and value != "-1"
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            # NOTE(review): uses `re` but no import of re
                            # is visible in this file view -- verify it is
                            # imported elsewhere in the file.
                            assert re.match("^[0-9]+$", value)
                            numeric_changes[i] = int(value)
                    elif k == 9:
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    else:
                        # any other changed field is unsupported
                        class Difference(Exception):pass
                        raise Difference, (hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, zip(bidir_changes, category_changes,
                                     decimal_changes, mirrored_changes,
                                     numeric_changes),
                        normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000693
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000694
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000695# --------------------------------------------------------------------
696# the following support code is taken from the unidb utilities
697# Copyright (c) 1999-2000 by Secret Labs AB
698
699# load a unicode-data file from disk
700
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000701import sys
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000702
703class UnicodeData:
704
    def __init__(self, filename, exclusions, eastasianwidth,
                 derivednormalizationprops=None, expand=1):
        """Load one version of the Unicode Character Database.

        filename/exclusions/eastasianwidth/derivednormalizationprops are
        paths to the corresponding UCD text files.  After loading,
        self.table[codepoint] is the list of semicolon-separated fields
        from UnicodeData.txt (plus appended east-asian-width and, when
        available, quick-check fields), or None for unassigned points.
        """
        self.changed = []
        file = open(filename)
        table = [None] * 0x110000
        while 1:
            s = file.readline()
            if not s:
                break
            s = s.strip().split(";")
            char = int(s[0], 16)
            table[char] = s

        # expand first-last ranges: "<..., First>"/"<..., Last>" marker
        # rows stand for every codepoint in between; give each one a
        # copy of the First row with its own codepoint and empty name
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(0x110000) # unicode 3.2

        # composition exclusions: set of codepoints (dict used as a set)
        file = open(exclusions)
        self.exclusions = {}
        for s in file:
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            char = int(s.split()[0],16)
            self.exclusions[char] = 1

        # east asian width: append the width code as an extra field
        # (index 15) of every assigned character's record
        widths = [None] * 0x110000
        for s in open(eastasianwidth):
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            s = s.split()[0].split(';')
            if '..' in s[0]:
                first, last = [int(c, 16) for c in s[0].split('..')]
                chars = range(first, last+1)
            else:
                chars = [int(s[0], 16)]
            for char in chars:
                widths[char] = s[1]
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

        # normalization quick-check flags: two bits per form, packed in
        # NFD/NFKD/NFC/NFKC order, appended as another extra field
        if derivednormalizationprops:
            quickchecks = [0] * 0x110000 # default is Yes
            qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
            for s in open(derivednormalizationprops):
                if '#' in s:
                    s = s[:s.index('#')]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in qc_order:
                    continue
                quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
                quickcheck_shift = qc_order.index(s[1])*2
                quickcheck <<= quickcheck_shift
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    # a property must not be set twice for one char
                    assert not (quickchecks[char]>>quickcheck_shift)&3
                    quickchecks[char] |= quickcheck
            for i in range(0, 0x110000):
                if table[i] is not None:
                    table[i].append(quickchecks[i])
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +0000791
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000792 def uselatin1(self):
793 # restrict character range to ISO Latin 1
794 self.chars = range(256)
795
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000796# hash table tools
797
798# this is a straight-forward reimplementation of Python's built-in
799# dictionary type, using a static data structure, and a custom string
800# hash algorithm.
801
def myhash(s, magic):
    """Hash the string *s* case-insensitively with multiplier *magic*.

    A multiply-and-add hash over ord(c) of the uppercased string.
    Whenever bits 24-31 of the running value become set, the top byte
    is folded back into the low bits and cleared, keeping the result
    independent of the platform's native integer width.
    """
    h = 0
    for c in map(ord, s.upper()):
        h = (h * magic) + c
        # 'L' suffix dropped from the literal: it was redundant on
        # Python 2 (ints auto-promote to long) and is a syntax error
        # on Python 3; the computed values are unchanged.
        ix = h & 0xff000000
        if ix:
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
    return h
810
# Candidate (size, poly) pairs for the static hash tables: table sizes
# are successive powers of two; Hash.__init__ picks the first size
# larger than the data set and uses size+poly to perturb colliding
# probe increments.  NOTE(review): the poly constants look like they
# were chosen so that size+poly forms a suitable polynomial for the
# double-hash probe sequence -- not verified here.
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
817
class Hash:
    """Build-time model of a static open-addressing hash table.

    Constructed from a (key, value) list; the finished table is held as
    an Array and emitted to C together with the #defines (magic, size,
    poly) that the C-side lookup code needs to reproduce the probing.
    """

    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size: the first power of two strictly larger
        # than the number of entries; poly becomes size+poly, the
        # constant used below to fold overflowing probe increments
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError, "ran out of polynominals"

        print size, "slots in hash table"

        table = [None] * size

        # size is a power of two, so size-1 masks an index into range
        mask = size-1

        n = 0

        hash = myhash

        # initialize hash table: primary slot is the complemented hash;
        # on collision, probe with a secondary increment derived from
        # the hash, doubling it (mod poly) after each failed probe
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            incr = (h ^ (h >> 3)) & mask;
            if not incr:
                incr = mask
            while 1:
                n = n + 1  # total collision count, reported below
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print n, "collisions"
        self.collisions = n

        # fill unused slots with 0 so the emitted C array has no gaps
        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array, followed by the #defines
        # the C lookup code needs to re-run the same probe sequence
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
881
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000882# stuff to deal with arrays of unsigned integers
883
class Array:
    """A named sequence of unsigned integers, dumpable as a C array."""

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        """Write the array to *file* as a static C array declaration.

        The element type is the smallest unsigned C type wide enough
        for every value (per getsize); output lines wrap at 78 columns.
        If *trace* is true, the total byte size is reported on stderr.
        """
        size = getsize(self.data)
        if trace:
            # was: print >>sys.stderr, ... -- rewritten with an
            # equivalent write (identical output) so this class also
            # parses under Python 3
            sys.stderr.write("%s: %d bytes\n" % (self.name, size*len(self.data)))
        file.write("static ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = "    "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    # current line is full; flush and start a new one
                    file.write(s + "\n")
                    s = "    " + i
                else:
                    s = s + i
            if s.strip():
                file.write(s + "\n")
        file.write("};\n\n")
915
def getsize(data):
    """Return the byte width (1, 2 or 4) of the smallest unsigned C
    integer type able to represent every value in *data*."""
    largest = max(data)
    if largest >= 65536:
        return 4
    if largest >= 256:
        return 2
    return 1
925
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    The split is found by exhaustive search: every feasible shift is
    tried, duplicate bins of size 2**shift are shared via a cache, and
    the (t1, t2, shift) triple with the smallest combined byte size wins.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """

    import sys
    if trace:
        def dump(t1, t2, shift, bytes):
            print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes)
        print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
                            "bytes"
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxint  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        # carve t into bins of 2**shift entries; identical bins are
        # stored in t2 only once, and t1 indexes them by bin number
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size of this candidate split
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print >>sys.stderr, "Best:",
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in xrange(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000987
if __name__ == "__main__":
    # Regenerate the Unicode database headers when run as a script.
    # NOTE(review): maketables is defined earlier in this file; the
    # argument presumably enables trace output -- confirm against its
    # signature.
    maketables(1)