blob: 8ede83c71cd281b8ed23f5c9b6a984abfa6c7633 [file] [log] [blame]
#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl    created (based on bits and pieces from unidb)
# 2000-09-25 fl    merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl    added character type table
# 2000-09-26 fl    added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl    expand first/last ranges
# 2001-01-19 fl    added character name tables (2.1)
# 2001-01-21 fl    added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd    use string methods
# 2002-10-18 mvl   update to Unicode 3.2
# 2002-10-22 mvl   generate NFC tables
# 2002-11-24 mvl   expand all ranges, sort names version-independently
# 2002-11-25 mvl   add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl   update to Unicode 4.1; add UCD 3.2 delta
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#
26
27import sys
28
29SCRIPT = sys.argv[0]
Martin v. Löwis24329ba2008-09-10 13:38:12 +000030VERSION = "2.6"
Fredrik Lundhf367cac2000-09-24 23:18:31 +000031
Martin v. Löwisb5c980b2002-11-25 09:13:37 +000032# The Unicode Database
Martin v. Löwis24329ba2008-09-10 13:38:12 +000033UNIDATA_VERSION = "5.1.0"
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000034UNICODE_DATA = "UnicodeData%s.txt"
35COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
36EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
37
38old_versions = ["3.2.0"]
Fredrik Lundhf367cac2000-09-24 23:18:31 +000039
40CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
41 "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
42 "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
43 "So" ]
44
45BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
46 "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
47 "ON" ]
48
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +000049EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]
50
Fredrik Lundh0f8fad42000-09-25 21:01:56 +000051# note: should match definitions in Objects/unicodectype.c
Fredrik Lundhe9133f72000-09-25 17:59:57 +000052ALPHA_MASK = 0x01
53DECIMAL_MASK = 0x02
54DIGIT_MASK = 0x04
55LOWER_MASK = 0x08
Fredrik Lundh0f8fad42000-09-25 21:01:56 +000056LINEBREAK_MASK = 0x10
Fredrik Lundhe9133f72000-09-25 17:59:57 +000057SPACE_MASK = 0x20
58TITLE_MASK = 0x40
59UPPER_MASK = 0x80
Martin v. Löwis24329ba2008-09-10 13:38:12 +000060NODELTA_MASK = 0x100
Fredrik Lundhe9133f72000-09-25 17:59:57 +000061
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000062def maketables(trace=0):
Fredrik Lundhf367cac2000-09-24 23:18:31 +000063
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000064 print "--- Reading", UNICODE_DATA % "", "..."
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000065
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000066 version = ""
67 unicode = UnicodeData(UNICODE_DATA % version,
68 COMPOSITION_EXCLUSIONS % version,
69 EASTASIAN_WIDTH % version)
Fredrik Lundhf367cac2000-09-24 23:18:31 +000070
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000071 print len(filter(None, unicode.table)), "characters"
72
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000073 for version in old_versions:
74 print "--- Reading", UNICODE_DATA % ("-"+version), "..."
75 old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
76 COMPOSITION_EXCLUSIONS % ("-"+version),
77 EASTASIAN_WIDTH % ("-"+version))
78 print len(filter(None, old_unicode.table)), "characters"
79 merge_old_version(version, unicode, old_unicode)
80
Fredrik Lundhb2dfd732001-01-21 23:31:52 +000081 makeunicodename(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000082 makeunicodedata(unicode, trace)
Fredrik Lundhb2dfd732001-01-21 23:31:52 +000083 makeunicodetype(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000084
85# --------------------------------------------------------------------
86# unicode character properties
87
88def makeunicodedata(unicode, trace):
89
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +000090 dummy = (0, 0, 0, 0, 0)
Fredrik Lundhf367cac2000-09-24 23:18:31 +000091 table = [dummy]
92 cache = {0: dummy}
93 index = [0] * len(unicode.chars)
94
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000095 FILE = "Modules/unicodedata_db.h"
96
97 print "--- Preparing", FILE, "..."
98
Fredrik Lundhcfcea492000-09-25 08:07:06 +000099 # 1) database properties
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000100
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000101 for char in unicode.chars:
102 record = unicode.table[char]
103 if record:
104 # extract database properties
105 category = CATEGORY_NAMES.index(record[2])
106 combining = int(record[3])
107 bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
108 mirrored = record[9] == "Y"
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000109 eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000110 item = (
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000111 category, combining, bidirectional, mirrored, eastasianwidth
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000112 )
113 # add entry to index and item tables
114 i = cache.get(item)
115 if i is None:
116 cache[item] = i = len(table)
117 table.append(item)
118 index[char] = i
119
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000120 # 2) decomposition data
121
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000122 decomp_data = [0]
123 decomp_prefix = [""]
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000124 decomp_index = [0] * len(unicode.chars)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000125 decomp_size = 0
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000126
Martin v. Löwis677bde22002-11-23 22:08:15 +0000127 comp_pairs = []
128 comp_first = [None] * len(unicode.chars)
129 comp_last = [None] * len(unicode.chars)
130
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000131 for char in unicode.chars:
132 record = unicode.table[char]
133 if record:
134 if record[5]:
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000135 decomp = record[5].split()
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000136 if len(decomp) > 19:
137 raise Exception, "character %x has a decomposition too large for nfd_nfkd" % char
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000138 # prefix
139 if decomp[0][0] == "<":
140 prefix = decomp.pop(0)
141 else:
142 prefix = ""
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000143 try:
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000144 i = decomp_prefix.index(prefix)
145 except ValueError:
146 i = len(decomp_prefix)
147 decomp_prefix.append(prefix)
148 prefix = i
149 assert prefix < 256
150 # content
151 decomp = [prefix + (len(decomp)<<8)] +\
152 map(lambda s: int(s, 16), decomp)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000153 # Collect NFC pairs
154 if not prefix and len(decomp) == 3 and \
155 char not in unicode.exclusions and \
156 unicode.table[decomp[1]][3] == "0":
157 p, l, r = decomp
158 comp_first[l] = 1
159 comp_last[r] = 1
160 comp_pairs.append((l,r,char))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000161 try:
162 i = decomp_data.index(decomp)
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000163 except ValueError:
164 i = len(decomp_data)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000165 decomp_data.extend(decomp)
166 decomp_size = decomp_size + len(decomp) * 2
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000167 else:
168 i = 0
169 decomp_index[char] = i
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000170
Martin v. Löwis677bde22002-11-23 22:08:15 +0000171 f = l = 0
172 comp_first_ranges = []
173 comp_last_ranges = []
174 prev_f = prev_l = None
175 for i in unicode.chars:
176 if comp_first[i] is not None:
177 comp_first[i] = f
178 f += 1
179 if prev_f is None:
180 prev_f = (i,i)
181 elif prev_f[1]+1 == i:
182 prev_f = prev_f[0],i
183 else:
184 comp_first_ranges.append(prev_f)
185 prev_f = (i,i)
186 if comp_last[i] is not None:
187 comp_last[i] = l
188 l += 1
189 if prev_l is None:
190 prev_l = (i,i)
191 elif prev_l[1]+1 == i:
192 prev_l = prev_l[0],i
193 else:
194 comp_last_ranges.append(prev_l)
195 prev_l = (i,i)
196 comp_first_ranges.append(prev_f)
197 comp_last_ranges.append(prev_l)
198 total_first = f
199 total_last = l
200
201 comp_data = [0]*(total_first*total_last)
202 for f,l,char in comp_pairs:
203 f = comp_first[f]
204 l = comp_last[l]
205 comp_data[f*total_last+l] = char
206
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000207 print len(table), "unique properties"
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000208 print len(decomp_prefix), "unique decomposition prefixes"
209 print len(decomp_data), "unique decomposition entries:",
210 print decomp_size, "bytes"
Martin v. Löwis677bde22002-11-23 22:08:15 +0000211 print total_first, "first characters in NFC"
212 print total_last, "last characters in NFC"
213 print len(comp_pairs), "NFC pairs"
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000214
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000215 print "--- Writing", FILE, "..."
216
Fred Drake9c685052000-10-26 03:56:46 +0000217 fp = open(FILE, "w")
218 print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
219 print >>fp
Martin v. Löwisb5c980b2002-11-25 09:13:37 +0000220 print >>fp, '#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION
Fred Drake9c685052000-10-26 03:56:46 +0000221 print >>fp, "/* a list of unique database records */"
222 print >>fp, \
223 "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000224 for item in table:
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000225 print >>fp, " {%d, %d, %d, %d, %d}," % item
Fred Drake9c685052000-10-26 03:56:46 +0000226 print >>fp, "};"
227 print >>fp
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000228
Martin v. Löwis677bde22002-11-23 22:08:15 +0000229 print >>fp, "/* Reindexing of NFC first characters. */"
230 print >>fp, "#define TOTAL_FIRST",total_first
231 print >>fp, "#define TOTAL_LAST",total_last
232 print >>fp, "struct reindex{int start;short count,index;};"
Martin v. Löwis111c1802008-06-13 07:47:47 +0000233 print >>fp, "static struct reindex nfc_first[] = {"
Martin v. Löwis677bde22002-11-23 22:08:15 +0000234 for start,end in comp_first_ranges:
235 print >>fp," { %d, %d, %d}," % (start,end-start,comp_first[start])
236 print >>fp," {0,0,0}"
237 print >>fp,"};\n"
Martin v. Löwis111c1802008-06-13 07:47:47 +0000238 print >>fp, "static struct reindex nfc_last[] = {"
Martin v. Löwis677bde22002-11-23 22:08:15 +0000239 for start,end in comp_last_ranges:
240 print >>fp," { %d, %d, %d}," % (start,end-start,comp_last[start])
241 print >>fp," {0,0,0}"
242 print >>fp,"};\n"
243
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000244 # FIXME: <fl> the following tables could be made static, and
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000245 # the support code moved into unicodedatabase.c
246
Fred Drake9c685052000-10-26 03:56:46 +0000247 print >>fp, "/* string literals */"
248 print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000249 for name in CATEGORY_NAMES:
Fred Drake9c685052000-10-26 03:56:46 +0000250 print >>fp, " \"%s\"," % name
251 print >>fp, " NULL"
252 print >>fp, "};"
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000253
Fred Drake9c685052000-10-26 03:56:46 +0000254 print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000255 for name in BIDIRECTIONAL_NAMES:
Fred Drake9c685052000-10-26 03:56:46 +0000256 print >>fp, " \"%s\"," % name
257 print >>fp, " NULL"
258 print >>fp, "};"
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000259
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000260 print >>fp, "const char *_PyUnicode_EastAsianWidthNames[] = {"
261 for name in EASTASIANWIDTH_NAMES:
262 print >>fp, " \"%s\"," % name
263 print >>fp, " NULL"
264 print >>fp, "};"
265
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000266 print >>fp, "static const char *decomp_prefix[] = {"
267 for name in decomp_prefix:
Fred Drake9c685052000-10-26 03:56:46 +0000268 print >>fp, " \"%s\"," % name
269 print >>fp, " NULL"
270 print >>fp, "};"
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000271
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000272 # split record index table
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000273 index1, index2, shift = splitbins(index, trace)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000274
Fred Drake9c685052000-10-26 03:56:46 +0000275 print >>fp, "/* index tables for the database records */"
276 print >>fp, "#define SHIFT", shift
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000277 Array("index1", index1).dump(fp, trace)
278 Array("index2", index2).dump(fp, trace)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000279
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000280 # split decomposition index table
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000281 index1, index2, shift = splitbins(decomp_index, trace)
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000282
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000283 print >>fp, "/* decomposition data */"
284 Array("decomp_data", decomp_data).dump(fp, trace)
285
Fred Drake9c685052000-10-26 03:56:46 +0000286 print >>fp, "/* index tables for the decomposition data */"
287 print >>fp, "#define DECOMP_SHIFT", shift
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000288 Array("decomp_index1", index1).dump(fp, trace)
289 Array("decomp_index2", index2).dump(fp, trace)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000290
Martin v. Löwis677bde22002-11-23 22:08:15 +0000291 index, index2, shift = splitbins(comp_data, trace)
292 print >>fp, "/* NFC pairs */"
293 print >>fp, "#define COMP_SHIFT", shift
294 Array("comp_index", index).dump(fp, trace)
295 Array("comp_data", index2).dump(fp, trace)
296
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000297 # Generate delta tables for old versions
298 for version, table, normalization in unicode.changed:
299 cversion = version.replace(".","_")
300 records = [table[0]]
301 cache = {table[0]:0}
302 index = [0] * len(table)
303 for i, record in enumerate(table):
304 try:
305 index[i] = cache[record]
306 except KeyError:
307 index[i] = cache[record] = len(records)
308 records.append(record)
309 index1, index2, shift = splitbins(index, trace)
310 print >>fp, "static const change_record change_records_%s[] = {" % cversion
311 for record in records:
312 print >>fp, "\t{ %s }," % ", ".join(map(str,record))
313 print >>fp, "};"
314 Array("changes_%s_index" % cversion, index1).dump(fp, trace)
315 Array("changes_%s_data" % cversion, index2).dump(fp, trace)
316 print >>fp, "static const change_record* get_change_%s(Py_UCS4 n)" % cversion
317 print >>fp, "{"
318 print >>fp, "\tint index;"
319 print >>fp, "\tif (n >= 0x110000) index = 0;"
320 print >>fp, "\telse {"
321 print >>fp, "\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift)
322 print >>fp, "\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
323 (cversion, shift, ((1<<shift)-1))
324 print >>fp, "\t}"
325 print >>fp, "\treturn change_records_%s+index;" % cversion
326 print >>fp, "}\n"
327 print >>fp, "static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion
328 print >>fp, "{"
329 print >>fp, "\tswitch(n) {"
330 for k, v in normalization:
331 print >>fp, "\tcase %s: return 0x%s;" % (hex(k), v)
332 print >>fp, "\tdefault: return 0;"
333 print >>fp, "\t}\n}\n"
334
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000335 fp.close()
336
337# --------------------------------------------------------------------
338# unicode character type tables
339
340def makeunicodetype(unicode, trace):
341
342 FILE = "Objects/unicodetype_db.h"
343
344 print "--- Preparing", FILE, "..."
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000345
346 # extract unicode types
Fredrik Lundh0f8fad42000-09-25 21:01:56 +0000347 dummy = (0, 0, 0, 0, 0, 0)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000348 table = [dummy]
349 cache = {0: dummy}
350 index = [0] * len(unicode.chars)
351
352 for char in unicode.chars:
353 record = unicode.table[char]
354 if record:
355 # extract database properties
356 category = record[2]
357 bidirectional = record[4]
358 flags = 0
Martin v. Löwis24329ba2008-09-10 13:38:12 +0000359 delta = True
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000360 if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
361 flags |= ALPHA_MASK
362 if category == "Ll":
363 flags |= LOWER_MASK
Fredrik Lundh0f8fad42000-09-25 21:01:56 +0000364 if category == "Zl" or bidirectional == "B":
365 flags |= LINEBREAK_MASK
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000366 if category == "Zs" or bidirectional in ("WS", "B", "S"):
367 flags |= SPACE_MASK
Fredrik Lundh375732c2000-09-25 23:03:34 +0000368 if category == "Lt":
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000369 flags |= TITLE_MASK
370 if category == "Lu":
371 flags |= UPPER_MASK
Martin v. Löwis24329ba2008-09-10 13:38:12 +0000372 # use delta predictor for upper/lower/title if it fits
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000373 if record[12]:
Martin v. Löwis99ac3282002-10-18 17:34:18 +0000374 upper = int(record[12], 16) - char
Martin v. Löwis24329ba2008-09-10 13:38:12 +0000375 if -32768 <= upper <= 32767 and delta:
376 upper = upper & 0xffff
377 else:
378 upper += char
379 delta = False
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000380 else:
381 upper = 0
382 if record[13]:
Martin v. Löwis99ac3282002-10-18 17:34:18 +0000383 lower = int(record[13], 16) - char
Martin v. Löwis24329ba2008-09-10 13:38:12 +0000384 if -32768 <= lower <= 32767 and delta:
385 lower = lower & 0xffff
386 else:
387 lower += char
388 delta = False
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000389 else:
390 lower = 0
391 if record[14]:
Martin v. Löwis99ac3282002-10-18 17:34:18 +0000392 title = int(record[14], 16) - char
Martin v. Löwis24329ba2008-09-10 13:38:12 +0000393 if -32768 <= lower <= 32767 and delta:
394 title = title & 0xffff
395 else:
396 title += char
397 delta = False
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000398 else:
399 title = 0
Martin v. Löwis24329ba2008-09-10 13:38:12 +0000400 if not delta:
401 flags |= NODELTA_MASK
Fredrik Lundh0f8fad42000-09-25 21:01:56 +0000402 # decimal digit, integer digit
403 decimal = 0
404 if record[6]:
405 flags |= DECIMAL_MASK
406 decimal = int(record[6])
407 digit = 0
408 if record[7]:
409 flags |= DIGIT_MASK
410 digit = int(record[7])
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000411 item = (
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +0000412 upper, lower, title, decimal, digit, flags
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000413 )
414 # add entry to index and item tables
415 i = cache.get(item)
416 if i is None:
417 cache[item] = i = len(table)
418 table.append(item)
419 index[char] = i
420
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000421 print len(table), "unique character type entries"
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000422
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000423 print "--- Writing", FILE, "..."
424
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000425 fp = open(FILE, "w")
Fred Drake9c685052000-10-26 03:56:46 +0000426 print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
427 print >>fp
428 print >>fp, "/* a list of unique character type descriptors */"
429 print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000430 for item in table:
Fred Drake9c685052000-10-26 03:56:46 +0000431 print >>fp, " {%d, %d, %d, %d, %d, %d}," % item
432 print >>fp, "};"
433 print >>fp
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000434
435 # split decomposition index table
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000436 index1, index2, shift = splitbins(index, trace)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000437
Fred Drake9c685052000-10-26 03:56:46 +0000438 print >>fp, "/* type indexes */"
439 print >>fp, "#define SHIFT", shift
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000440 Array("index1", index1).dump(fp, trace)
441 Array("index2", index2).dump(fp, trace)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000442
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000443 fp.close()
444
445# --------------------------------------------------------------------
446# unicode name database
447
448def makeunicodename(unicode, trace):
449
450 FILE = "Modules/unicodename_db.h"
451
452 print "--- Preparing", FILE, "..."
453
454 # collect names
455 names = [None] * len(unicode.chars)
456
457 for char in unicode.chars:
458 record = unicode.table[char]
459 if record:
460 name = record[1].strip()
461 if name and name[0] != "<":
462 names[char] = name + chr(0)
463
464 print len(filter(lambda n: n is not None, names)), "distinct names"
465
466 # collect unique words from names (note that we differ between
467 # words inside a sentence, and words ending a sentence. the
468 # latter includes the trailing null byte.
469
470 words = {}
471 n = b = 0
472 for char in unicode.chars:
473 name = names[char]
474 if name:
475 w = name.split()
476 b = b + len(name)
477 n = n + len(w)
478 for w in w:
479 l = words.get(w)
480 if l:
481 l.append(None)
482 else:
483 words[w] = [len(words)]
484
485 print n, "words in text;", b, "bytes"
486
487 wordlist = words.items()
488
Martin v. Löwis97225da2002-11-24 23:05:09 +0000489 # sort on falling frequency, then by name
490 def cmpwords((aword, alist),(bword, blist)):
491 r = -cmp(len(alist),len(blist))
492 if r:
493 return r
494 return cmp(aword, bword)
495 wordlist.sort(cmpwords)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000496
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000497 # figure out how many phrasebook escapes we need
498 escapes = 0
499 while escapes * 256 < len(wordlist):
500 escapes = escapes + 1
501 print escapes, "escapes"
502
503 short = 256 - escapes
504
505 assert short > 0
506
507 print short, "short indexes in lexicon"
508
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000509 # statistics
510 n = 0
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000511 for i in range(short):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000512 n = n + len(wordlist[i][1])
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000513 print n, "short indexes in phrasebook"
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000514
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000515 # pick the most commonly used words, and sort the rest on falling
516 # length (to maximize overlap)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000517
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000518 wordlist, wordtail = wordlist[:short], wordlist[short:]
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000519 wordtail.sort(lambda a, b: len(b[0])-len(a[0]))
520 wordlist.extend(wordtail)
521
522 # generate lexicon from words
523
524 lexicon_offset = [0]
525 lexicon = ""
526 words = {}
527
528 # build a lexicon string
529 offset = 0
530 for w, x in wordlist:
531 # encoding: bit 7 indicates last character in word (chr(128)
532 # indicates the last character in an entire string)
533 ww = w[:-1] + chr(ord(w[-1])+128)
534 # reuse string tails, when possible
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000535 o = lexicon.find(ww)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000536 if o < 0:
537 o = offset
538 lexicon = lexicon + ww
539 offset = offset + len(w)
540 words[w] = len(lexicon_offset)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000541 lexicon_offset.append(o)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000542
543 lexicon = map(ord, lexicon)
544
545 # generate phrasebook from names and lexicon
546 phrasebook = [0]
547 phrasebook_offset = [0] * len(unicode.chars)
548 for char in unicode.chars:
549 name = names[char]
550 if name:
551 w = name.split()
552 phrasebook_offset[char] = len(phrasebook)
553 for w in w:
554 i = words[w]
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000555 if i < short:
556 phrasebook.append(i)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000557 else:
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000558 # store as two bytes
559 phrasebook.append((i>>8) + short)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000560 phrasebook.append(i&255)
561
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000562 assert getsize(phrasebook) == 1
563
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000564 #
565 # unicode name hash table
566
567 # extract names
568 data = []
569 for char in unicode.chars:
570 record = unicode.table[char]
571 if record:
572 name = record[1].strip()
573 if name and name[0] != "<":
574 data.append((name, char))
575
576 # the magic number 47 was chosen to minimize the number of
577 # collisions on the current data set. if you like, change it
578 # and see what happens...
579
580 codehash = Hash("code", data, 47)
581
582 print "--- Writing", FILE, "..."
583
584 fp = open(FILE, "w")
585 print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
586 print >>fp
587 print >>fp, "#define NAME_MAXLEN", 256
588 print >>fp
589 print >>fp, "/* lexicon */"
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000590 Array("lexicon", lexicon).dump(fp, trace)
591 Array("lexicon_offset", lexicon_offset).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000592
593 # split decomposition index table
594 offset1, offset2, shift = splitbins(phrasebook_offset, trace)
595
596 print >>fp, "/* code->name phrasebook */"
597 print >>fp, "#define phrasebook_shift", shift
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000598 print >>fp, "#define phrasebook_short", short
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000599
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000600 Array("phrasebook", phrasebook).dump(fp, trace)
601 Array("phrasebook_offset1", offset1).dump(fp, trace)
602 Array("phrasebook_offset2", offset2).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000603
604 print >>fp, "/* name->code dictionary */"
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000605 codehash.dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000606
607 fp.close()
608
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000609
610def merge_old_version(version, new, old):
611 # Changes to exclusion file not implemented yet
612 if old.exclusions != new.exclusions:
613 raise NotImplementedError, "exclusions differ"
614
615 # In these change records, 0xFF means "no change"
616 bidir_changes = [0xFF]*0x110000
617 category_changes = [0xFF]*0x110000
618 decimal_changes = [0xFF]*0x110000
Martin v. Löwis24329ba2008-09-10 13:38:12 +0000619 mirrored_changes = [0xFF]*0x110000
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000620 # In numeric data, 0 means "no change",
621 # -1 means "did not have a numeric value
622 numeric_changes = [0] * 0x110000
623 # normalization_changes is a list of key-value pairs
624 normalization_changes = []
625 for i in range(0x110000):
626 if new.table[i] is None:
627 # Characters unassigned in the new version ought to
628 # be unassigned in the old one
629 assert old.table[i] is None
630 continue
631 # check characters unassigned in the old version
632 if old.table[i] is None:
633 # category 0 is "unassigned"
634 category_changes[i] = 0
635 continue
636 # check characters that differ
637 if old.table[i] != new.table[i]:
638 for k in range(len(old.table[i])):
639 if old.table[i][k] != new.table[i][k]:
640 value = old.table[i][k]
641 if k == 2:
642 #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
643 category_changes[i] = CATEGORY_NAMES.index(value)
644 elif k == 4:
645 #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
646 bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
647 elif k == 5:
648 #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
649 # We assume that all normalization changes are in 1:1 mappings
650 assert " " not in value
651 normalization_changes.append((i, value))
652 elif k == 6:
653 #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
654 # we only support changes where the old value is a single digit
655 assert value in "0123456789"
656 decimal_changes[i] = int(value)
657 elif k == 8:
658 # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
659 # Since 0 encodes "no change", the old value is better not 0
660 assert value != "0" and value != "-1"
661 if not value:
662 numeric_changes[i] = -1
663 else:
664 assert re.match("^[0-9]+$", value)
665 numeric_changes[i] = int(value)
Martin v. Löwis24329ba2008-09-10 13:38:12 +0000666 elif k == 9:
667 if value == 'Y':
668 mirrored_changes[i] = '1'
669 else:
670 mirrored_changes[i] = '0'
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000671 elif k == 11:
672 # change to ISO comment, ignore
673 pass
674 elif k == 12:
675 # change to simple uppercase mapping; ignore
676 pass
677 elif k == 13:
678 # change to simple lowercase mapping; ignore
679 pass
680 elif k == 14:
681 # change to simple titlecase mapping; ignore
682 pass
683 else:
684 class Difference(Exception):pass
685 raise Difference, (hex(i), k, old.table[i], new.table[i])
686 new.changed.append((version, zip(bidir_changes, category_changes,
Martin v. Löwis24329ba2008-09-10 13:38:12 +0000687 decimal_changes, mirrored_changes,
688 numeric_changes),
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000689 normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000690
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000691
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000692# --------------------------------------------------------------------
693# the following support code is taken from the unidb utilities
694# Copyright (c) 1999-2000 by Secret Labs AB
695
696# load a unicode-data file from disk
697
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000698import sys
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000699
class UnicodeData:
    """In-memory representation of the Unicode character database.

    Loads three data files:
      filename       -- UnicodeData.txt-style file: one semicolon-separated
                        record per assigned code point
      exclusions     -- CompositionExclusions.txt-style file: one hex code
                        point per non-comment line
      eastasianwidth -- EastAsianWidth.txt-style file: "codepoint;width" or
                        "first..last;width" per non-comment line; the width
                        class is appended as the last field of every record

    Public attributes: filename, table (one record list or None per code
    point), chars (all code points considered), exclusions (dict keyed by
    code point), changed (version deltas, filled in elsewhere).
    """

    def __init__(self, filename, exclusions, eastasianwidth, expand=1):
        # deltas against older UCD versions; populated outside __init__
        self.changed = []

        # one slot per code point, U+0000..U+10FFFF
        table = [None] * 0x110000
        file = open(filename)
        try:
            for s in file:
                s = s.strip()
                if not s:
                    continue  # tolerate blank lines (original crashed here)
                s = s.split(";")
                char = int(s[0], 16)
                table[char] = s
        finally:
            file.close()  # original leaked this handle

        # expand first-last ranges: the data file encodes large homogeneous
        # ranges as a single <..., First>/<..., Last> record pair; give
        # every code point in between its own copy of the First record
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""       # range records carry no real name
                        field = s
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
                    f2 = field[:]
                    f2[0] = "%X" % i    # patch in this code point
                    table[i] = f2

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(0x110000) # unicode 3.2

        # composition exclusions: code points excluded from canonical
        # composition (value is always 1; the dict acts as a set)
        self.exclusions = {}
        file = open(exclusions)
        try:
            for s in file:
                s = s.strip()
                if not s or s[0] == '#':
                    continue  # skip blank lines and comments
                char = int(s.split()[0], 16)
                self.exclusions[char] = 1
        finally:
            file.close()

        # east asian width: collect the width class per code point, then
        # append it as an extra trailing field on every existing record
        widths = [None] * 0x110000
        file = open(eastasianwidth)
        try:
            for s in file:
                s = s.strip()
                if not s or s[0] == '#':
                    continue
                s = s.split()[0].split(';')
                if '..' in s[0]:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                    chars = range(first, last+1)
                else:
                    chars = [int(s[0], 16)]
                for char in chars:
                    widths[char] = s[1]
        finally:
            file.close()
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = range(256)
769
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000770# hash table tools
771
# this is a straightforward reimplementation of Python's built-in
# dictionary type, using a static data structure and a custom string
# hash algorithm.
775
def myhash(s, magic):
    # Case-insensitive string hash used to build the static name tables.
    # Each character is mixed into the accumulator; whenever bits rise
    # above the low 24, the top byte is folded back in and masked off,
    # keeping the running value bounded.
    h = 0
    for ch in s.upper():
        h = h * magic + ord(ch)
        overflow = h & 0xff000000
        if overflow:
            h = (h ^ ((overflow >> 24) & 0xff)) & 0x00ffffff
    return h
784
# candidate hash-table sizes (powers of two) paired with the probe
# polynomial constant Hash.__init__ uses for that size; the first pair
# whose size exceeds the number of entries is chosen
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
791
class Hash:
    """Build a static, open-addressed hash table from (key, value) pairs.

    The table is sized from SIZES, filled with myhash(), and later dumped
    as a C array plus the #defines (magic, size, poly) that matching C
    lookup code needs to reproduce the same probe sequence.
    """

    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size: smallest power of two larger than the
        # number of entries; poly is the matching probe polynomial
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError, "ran out of polynominals"

        print size, "slots in hash table"

        table = [None] * size

        mask = size-1

        n = 0  # collision counter

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            # slot taken: derive a probe increment from the hash; it is
            # doubled on each further collision and folded with poly
            # whenever it overflows the table mask
            incr = (h ^ (h >> 3)) & mask;
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print n, "collisions"
        self.collisions = n

        # empty slots are represented as 0 in the generated C table
        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array, followed by the #defines the
        # C-side lookup needs (magic, size, poly)
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
855
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000856# stuff to deal with arrays of unsigned integers
857
class Array:
    """A named sequence of unsigned integers that can be written out as a
    static C array using the smallest sufficient element type."""

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array; with trace, report the
        # array's total byte size on stderr
        size = getsize(self.data)
        if trace:
            sys.stderr.write("%s: %d bytes\n" %
                             (self.name, size * len(self.data)))
        if size == 1:
            ctype = "unsigned char"
        elif size == 2:
            ctype = "unsigned short"
        else:
            ctype = "unsigned int"
        file.write("static " + ctype + " " + self.name + "[] = {\n")
        if self.data:
            # emit comma-separated values, wrapping lines at 78 columns
            line = "    "
            for value in self.data:
                piece = str(value) + ", "
                if len(line) + len(piece) > 78:
                    file.write(line + "\n")
                    line = "    " + piece
                else:
                    line = line + piece
            if line.strip():
                file.write(line + "\n")
        file.write("};\n\n")
889
def getsize(data):
    """Return 1, 2, or 4: the byte width of the smallest unsigned C
    integer type able to hold every value in *data*."""
    biggest = max(data)
    if biggest < 0x100:
        return 1
    if biggest < 0x10000:
        return 2
    return 4
899
Tim Peters21013482000-09-25 07:13:41 +0000900def splitbins(t, trace=0):
901 """t, trace=0 -> (t1, t2, shift). Split a table to save space.
902
903 t is a sequence of ints. This function can be useful to save space if
904 many of the ints are the same. t1 and t2 are lists of ints, and shift
905 is an int, chosen to minimize the combined size of t1 and t2 (in C
906 code), and where for each i in range(len(t)),
907 t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
908 where mask is a bitmask isolating the last "shift" bits.
909
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000910 If optional arg trace is non-zero (default zero), progress info
911 is printed to sys.stderr. The higher the value, the more info
912 you'll get.
Tim Peters21013482000-09-25 07:13:41 +0000913 """
914
915 import sys
916 if trace:
917 def dump(t1, t2, shift, bytes):
918 print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
919 len(t1), len(t2), shift, bytes)
920 print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
921 "bytes"
922 n = len(t)-1 # last valid index
923 maxshift = 0 # the most we can shift n and still have something left
924 if n > 0:
925 while n >> 1:
926 n >>= 1
927 maxshift += 1
928 del n
929 bytes = sys.maxint # smallest total size so far
930 t = tuple(t) # so slices can be dict keys
931 for shift in range(maxshift + 1):
932 t1 = []
933 t2 = []
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000934 size = 2**shift
935 bincache = {}
Tim Peters21013482000-09-25 07:13:41 +0000936 for i in range(0, len(t), size):
937 bin = t[i:i+size]
938 index = bincache.get(bin)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000939 if index is None:
Tim Peters21013482000-09-25 07:13:41 +0000940 index = len(t2)
941 bincache[bin] = index
942 t2.extend(bin)
943 t1.append(index >> shift)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000944 # determine memory size
Tim Peters21013482000-09-25 07:13:41 +0000945 b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000946 if trace > 1:
Tim Peters21013482000-09-25 07:13:41 +0000947 dump(t1, t2, shift, b)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000948 if b < bytes:
Tim Peters21013482000-09-25 07:13:41 +0000949 best = t1, t2, shift
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000950 bytes = b
Tim Peters21013482000-09-25 07:13:41 +0000951 t1, t2, shift = best
952 if trace:
953 print >>sys.stderr, "Best:",
954 dump(t1, t2, shift, bytes)
955 if __debug__:
956 # exhaustively verify that the decomposition is correct
957 mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
958 for i in xrange(len(t)):
959 assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
960 return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000961
if __name__ == "__main__":
    # regenerate the databases; maketables() is defined earlier in the
    # full script (the 1 is presumably a trace/verbosity flag -- confirm
    # against the maketables definition)
    maketables(1)