blob: 3cd5a1f842b09d74d5f3308d10576f843a5c7891 [file] [log] [blame]
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001#
Fredrik Lundhe9133f72000-09-25 17:59:57 +00002# (re)generate unicode property and type databases
3#
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00004# this script converts a unicode 3.2 database file to
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00005# Modules/unicodedata_db.h, Modules/unicodename_db.h,
6# and Objects/unicodetype_db.h
Fredrik Lundhcfcea492000-09-25 08:07:06 +00007#
8# history:
9# 2000-09-24 fl created (based on bits and pieces from unidb)
10# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
Fredrik Lundhe9133f72000-09-25 17:59:57 +000011# 2000-09-25 fl added character type table
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000012# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000013# 2000-11-03 fl expand first/last ranges
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000014# 2001-01-19 fl added character name tables (2.1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000015# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold
Martin v. Löwis677bde22002-11-23 22:08:15 +000016# 2002-09-11 wd use string methods
17# 2002-10-18 mvl update to Unicode 3.2
18# 2002-10-22 mvl generate NFC tables
Martin v. Löwis97225da2002-11-24 23:05:09 +000019# 2002-11-24 mvl expand all ranges, sort names version-independently
Martin v. Löwisb5c980b2002-11-25 09:13:37 +000020# 2002-11-25 mvl add UNIDATA_VERSION
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +000021# 2004-05-29 perky add east asian width information
Martin v. Löwis43179c82006-03-11 12:43:44 +000022# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
Fredrik Lundhcfcea492000-09-25 08:07:06 +000023#
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000024# written by Fredrik Lundh (fredrik@pythonware.com)
Fredrik Lundhf367cac2000-09-24 23:18:31 +000025#
26
27import sys
28
SCRIPT = sys.argv[0]
VERSION = "2.6"

# The Unicode Database
UNIDATA_VERSION = "5.1.0"
# %s is filled with "" for the current version, or "-<version>" for
# the old snapshots listed in old_versions below.
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"

# older UCD snapshots for which delta tables are generated
old_versions = ["3.2.0"]

# list position is the numeric category code written into the generated
# tables (index 0, "Cn", is "unassigned" -- see merge_old_version).
# NOTE(review): "Cn" appears twice (positions 0 and 17) -- presumably
# intentional padding of the historical code assignment; confirm before
# reordering.
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

# list position is the numeric bidirectional code written into the tables
BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

# list position is the numeric east-asian-width code written into the tables
EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

# character-type flag bits, combined into the flags field emitted by
# makeunicodetype below
# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
# set when an upper/lower/title mapping does not fit the 16-bit delta
# predictor (see makeunicodetype)
NODELTA_MASK = 0x100
Fredrik Lundhe9133f72000-09-25 17:59:57 +000061
def maketables(trace=0):
    """Top-level driver: load the current UCD files (plus each snapshot in
    old_versions), merge the old snapshots into delta records, and write the
    three generated headers (name, data and type databases)."""

    print "--- Reading", UNICODE_DATA % "", "..."

    version = ""
    unicode = UnicodeData(UNICODE_DATA % version,
                          COMPOSITION_EXCLUSIONS % version,
                          EASTASIAN_WIDTH % version)

    # table entries are None for unassigned code points; count the rest
    print len(filter(None, unicode.table)), "characters"

    for version in old_versions:
        print "--- Reading", UNICODE_DATA % ("-"+version), "..."
        old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
                                  COMPOSITION_EXCLUSIONS % ("-"+version),
                                  EASTASIAN_WIDTH % ("-"+version))
        print len(filter(None, old_unicode.table)), "characters"
        # record per-field deltas on unicode.changed for makeunicodedata
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000084
85# --------------------------------------------------------------------
86# unicode character properties
87
88def makeunicodedata(unicode, trace):
89
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +000090 dummy = (0, 0, 0, 0, 0)
Fredrik Lundhf367cac2000-09-24 23:18:31 +000091 table = [dummy]
92 cache = {0: dummy}
93 index = [0] * len(unicode.chars)
94
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000095 FILE = "Modules/unicodedata_db.h"
96
97 print "--- Preparing", FILE, "..."
98
Fredrik Lundhcfcea492000-09-25 08:07:06 +000099 # 1) database properties
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000100
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000101 for char in unicode.chars:
102 record = unicode.table[char]
103 if record:
104 # extract database properties
105 category = CATEGORY_NAMES.index(record[2])
106 combining = int(record[3])
107 bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
108 mirrored = record[9] == "Y"
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000109 eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000110 item = (
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000111 category, combining, bidirectional, mirrored, eastasianwidth
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000112 )
113 # add entry to index and item tables
114 i = cache.get(item)
115 if i is None:
116 cache[item] = i = len(table)
117 table.append(item)
118 index[char] = i
119
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000120 # 2) decomposition data
121
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000122 decomp_data = [0]
123 decomp_prefix = [""]
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000124 decomp_index = [0] * len(unicode.chars)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000125 decomp_size = 0
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000126
Martin v. Löwis677bde22002-11-23 22:08:15 +0000127 comp_pairs = []
128 comp_first = [None] * len(unicode.chars)
129 comp_last = [None] * len(unicode.chars)
130
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000131 for char in unicode.chars:
132 record = unicode.table[char]
133 if record:
134 if record[5]:
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000135 decomp = record[5].split()
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000136 if len(decomp) > 19:
137 raise Exception, "character %x has a decomposition too large for nfd_nfkd" % char
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000138 # prefix
139 if decomp[0][0] == "<":
140 prefix = decomp.pop(0)
141 else:
142 prefix = ""
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000143 try:
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000144 i = decomp_prefix.index(prefix)
145 except ValueError:
146 i = len(decomp_prefix)
147 decomp_prefix.append(prefix)
148 prefix = i
149 assert prefix < 256
150 # content
151 decomp = [prefix + (len(decomp)<<8)] +\
152 map(lambda s: int(s, 16), decomp)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000153 # Collect NFC pairs
154 if not prefix and len(decomp) == 3 and \
155 char not in unicode.exclusions and \
156 unicode.table[decomp[1]][3] == "0":
157 p, l, r = decomp
158 comp_first[l] = 1
159 comp_last[r] = 1
160 comp_pairs.append((l,r,char))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000161 try:
162 i = decomp_data.index(decomp)
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000163 except ValueError:
164 i = len(decomp_data)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000165 decomp_data.extend(decomp)
166 decomp_size = decomp_size + len(decomp) * 2
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000167 else:
168 i = 0
169 decomp_index[char] = i
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000170
Martin v. Löwis677bde22002-11-23 22:08:15 +0000171 f = l = 0
172 comp_first_ranges = []
173 comp_last_ranges = []
174 prev_f = prev_l = None
175 for i in unicode.chars:
176 if comp_first[i] is not None:
177 comp_first[i] = f
178 f += 1
179 if prev_f is None:
180 prev_f = (i,i)
181 elif prev_f[1]+1 == i:
182 prev_f = prev_f[0],i
183 else:
184 comp_first_ranges.append(prev_f)
185 prev_f = (i,i)
186 if comp_last[i] is not None:
187 comp_last[i] = l
188 l += 1
189 if prev_l is None:
190 prev_l = (i,i)
191 elif prev_l[1]+1 == i:
192 prev_l = prev_l[0],i
193 else:
194 comp_last_ranges.append(prev_l)
195 prev_l = (i,i)
196 comp_first_ranges.append(prev_f)
197 comp_last_ranges.append(prev_l)
198 total_first = f
199 total_last = l
200
201 comp_data = [0]*(total_first*total_last)
202 for f,l,char in comp_pairs:
203 f = comp_first[f]
204 l = comp_last[l]
205 comp_data[f*total_last+l] = char
206
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000207 print len(table), "unique properties"
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000208 print len(decomp_prefix), "unique decomposition prefixes"
209 print len(decomp_data), "unique decomposition entries:",
210 print decomp_size, "bytes"
Martin v. Löwis677bde22002-11-23 22:08:15 +0000211 print total_first, "first characters in NFC"
212 print total_last, "last characters in NFC"
213 print len(comp_pairs), "NFC pairs"
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000214
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000215 print "--- Writing", FILE, "..."
216
Fred Drake9c685052000-10-26 03:56:46 +0000217 fp = open(FILE, "w")
218 print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
219 print >>fp
Martin v. Löwisb5c980b2002-11-25 09:13:37 +0000220 print >>fp, '#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION
Fred Drake9c685052000-10-26 03:56:46 +0000221 print >>fp, "/* a list of unique database records */"
222 print >>fp, \
223 "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000224 for item in table:
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000225 print >>fp, " {%d, %d, %d, %d, %d}," % item
Fred Drake9c685052000-10-26 03:56:46 +0000226 print >>fp, "};"
227 print >>fp
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000228
Martin v. Löwis677bde22002-11-23 22:08:15 +0000229 print >>fp, "/* Reindexing of NFC first characters. */"
230 print >>fp, "#define TOTAL_FIRST",total_first
231 print >>fp, "#define TOTAL_LAST",total_last
232 print >>fp, "struct reindex{int start;short count,index;};"
Martin v. Löwis111c1802008-06-13 07:47:47 +0000233 print >>fp, "static struct reindex nfc_first[] = {"
Martin v. Löwis677bde22002-11-23 22:08:15 +0000234 for start,end in comp_first_ranges:
235 print >>fp," { %d, %d, %d}," % (start,end-start,comp_first[start])
236 print >>fp," {0,0,0}"
237 print >>fp,"};\n"
Martin v. Löwis111c1802008-06-13 07:47:47 +0000238 print >>fp, "static struct reindex nfc_last[] = {"
Martin v. Löwis677bde22002-11-23 22:08:15 +0000239 for start,end in comp_last_ranges:
240 print >>fp," { %d, %d, %d}," % (start,end-start,comp_last[start])
241 print >>fp," {0,0,0}"
242 print >>fp,"};\n"
243
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000244 # FIXME: <fl> the following tables could be made static, and
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000245 # the support code moved into unicodedatabase.c
246
Fred Drake9c685052000-10-26 03:56:46 +0000247 print >>fp, "/* string literals */"
248 print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000249 for name in CATEGORY_NAMES:
Fred Drake9c685052000-10-26 03:56:46 +0000250 print >>fp, " \"%s\"," % name
251 print >>fp, " NULL"
252 print >>fp, "};"
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000253
Fred Drake9c685052000-10-26 03:56:46 +0000254 print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000255 for name in BIDIRECTIONAL_NAMES:
Fred Drake9c685052000-10-26 03:56:46 +0000256 print >>fp, " \"%s\"," % name
257 print >>fp, " NULL"
258 print >>fp, "};"
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000259
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000260 print >>fp, "const char *_PyUnicode_EastAsianWidthNames[] = {"
261 for name in EASTASIANWIDTH_NAMES:
262 print >>fp, " \"%s\"," % name
263 print >>fp, " NULL"
264 print >>fp, "};"
265
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000266 print >>fp, "static const char *decomp_prefix[] = {"
267 for name in decomp_prefix:
Fred Drake9c685052000-10-26 03:56:46 +0000268 print >>fp, " \"%s\"," % name
269 print >>fp, " NULL"
270 print >>fp, "};"
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000271
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000272 # split record index table
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000273 index1, index2, shift = splitbins(index, trace)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000274
Fred Drake9c685052000-10-26 03:56:46 +0000275 print >>fp, "/* index tables for the database records */"
276 print >>fp, "#define SHIFT", shift
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000277 Array("index1", index1).dump(fp, trace)
278 Array("index2", index2).dump(fp, trace)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000279
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000280 # split decomposition index table
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000281 index1, index2, shift = splitbins(decomp_index, trace)
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000282
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000283 print >>fp, "/* decomposition data */"
284 Array("decomp_data", decomp_data).dump(fp, trace)
285
Fred Drake9c685052000-10-26 03:56:46 +0000286 print >>fp, "/* index tables for the decomposition data */"
287 print >>fp, "#define DECOMP_SHIFT", shift
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000288 Array("decomp_index1", index1).dump(fp, trace)
289 Array("decomp_index2", index2).dump(fp, trace)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000290
Martin v. Löwis677bde22002-11-23 22:08:15 +0000291 index, index2, shift = splitbins(comp_data, trace)
292 print >>fp, "/* NFC pairs */"
293 print >>fp, "#define COMP_SHIFT", shift
294 Array("comp_index", index).dump(fp, trace)
295 Array("comp_data", index2).dump(fp, trace)
296
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000297 # Generate delta tables for old versions
298 for version, table, normalization in unicode.changed:
299 cversion = version.replace(".","_")
300 records = [table[0]]
301 cache = {table[0]:0}
302 index = [0] * len(table)
303 for i, record in enumerate(table):
304 try:
305 index[i] = cache[record]
306 except KeyError:
307 index[i] = cache[record] = len(records)
308 records.append(record)
309 index1, index2, shift = splitbins(index, trace)
310 print >>fp, "static const change_record change_records_%s[] = {" % cversion
311 for record in records:
312 print >>fp, "\t{ %s }," % ", ".join(map(str,record))
313 print >>fp, "};"
314 Array("changes_%s_index" % cversion, index1).dump(fp, trace)
315 Array("changes_%s_data" % cversion, index2).dump(fp, trace)
316 print >>fp, "static const change_record* get_change_%s(Py_UCS4 n)" % cversion
317 print >>fp, "{"
318 print >>fp, "\tint index;"
319 print >>fp, "\tif (n >= 0x110000) index = 0;"
320 print >>fp, "\telse {"
321 print >>fp, "\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift)
322 print >>fp, "\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
323 (cversion, shift, ((1<<shift)-1))
324 print >>fp, "\t}"
325 print >>fp, "\treturn change_records_%s+index;" % cversion
326 print >>fp, "}\n"
327 print >>fp, "static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion
328 print >>fp, "{"
329 print >>fp, "\tswitch(n) {"
330 for k, v in normalization:
331 print >>fp, "\tcase %s: return 0x%s;" % (hex(k), v)
332 print >>fp, "\tdefault: return 0;"
333 print >>fp, "\t}\n}\n"
334
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000335 fp.close()
336
337# --------------------------------------------------------------------
338# unicode character type tables
339
def makeunicodetype(unicode, trace):
    """Write Objects/unicodetype_db.h: deduplicated character type records
    (upper/lower/title mappings, decimal/digit values, flag bits) plus the
    two-level index that maps code points to records."""

    FILE = "Objects/unicodetype_db.h"

    print "--- Preparing", FILE, "..."

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            flags = 0
            # NOTE(review): `delta` is assigned but never read in this
            # function -- apparently a leftover; confirm before relying on it
            delta = True
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if category == "Zl" or bidirectional == "B":
                flags |= LINEBREAK_MASK
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            # use delta predictor for upper/lower/title if it fits
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                # UCD.html says that a missing title char means that
                # it defaults to the uppercase character, not to the
                # character itself. Apparently, in the current UCD (5.x)
                # this feature is never used
                title = upper
            upper_d = upper - char
            lower_d = lower - char
            title_d = title - char
            if -32768 <= upper_d <= 32767 and \
               -32768 <= lower_d <= 32767 and \
               -32768 <= title_d <= 32767:
                # use deltas, stored as 16-bit two's complement
                upper = upper_d & 0xffff
                lower = lower_d & 0xffff
                title = title_d & 0xffff
            else:
                # mapping too far away: store absolute values and flag it
                flags |= NODELTA_MASK
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables (records are deduplicated)
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print len(table), "unique character type entries"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "/* a list of unique character type descriptors */"
    print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    # split decomposition index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* type indexes */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    fp.close()
443
444# --------------------------------------------------------------------
445# unicode name database
446
def makeunicodename(unicode, trace):
    """Write Modules/unicodename_db.h: a compressed lexicon of name words,
    a phrasebook mapping code points to word sequences, and a static hash
    table mapping names back to code points."""

    FILE = "Modules/unicodename_db.h"

    print "--- Preparing", FILE, "..."

    # collect names (control/surrogate "<...>" labels are skipped)
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print len(filter(lambda n: n is not None, names)), "distinct names"

    # collect unique words from names (note that we distinguish between
    # words inside a sentence, and words ending a sentence; the
    # latter includes the trailing null byte)

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                # the list length doubles as a use counter; the first
                # element is the word's original index
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print n, "words in text;", b, "bytes"

    wordlist = words.items()

    # sort on falling frequency, then by name
    def cmpwords((aword, alist),(bword, blist)):
        r = -cmp(len(alist),len(blist))
        if r:
            return r
        return cmp(aword, bword)
    wordlist.sort(cmpwords)

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print escapes, "escapes"

    # word indexes below `short` fit in one phrasebook byte; the rest
    # are stored as an escape byte plus a second byte
    short = 256 - escapes

    assert short > 0

    print short, "short indexes in lexicon"

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print n, "short indexes in phrasebook"

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(lambda a, b: len(b[0])-len(a[0]))
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = map(ord, lexicon)

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    # every phrasebook entry must fit in a single byte
    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set.  if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "#define NAME_MAXLEN", 256
    print >>fp
    print >>fp, "/* lexicon */"
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split the phrasebook offset table into two levels
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print >>fp, "/* code->name phrasebook */"
    print >>fp, "#define phrasebook_shift", shift
    print >>fp, "#define phrasebook_short", short

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print >>fp, "/* name->code dictionary */"
    codehash.dump(fp, trace)

    fp.close()
607
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000608
def merge_old_version(version, new, old):
    """Compute per-field deltas between an old UCD snapshot and the new one.

    Appends (version, change_records, normalization_changes) to new.changed,
    where change_records is a per-code-point zip of (bidir, category,
    decimal, mirrored, numeric) changes consumed by makeunicodedata's
    delta-table generation.

    Raises NotImplementedError if the composition exclusions differ (not
    supported), and fails an assertion on any unsupported field change.
    """
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value is better not 0
                        assert value != "0" and value != "-1"
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            # bug fix: the original asserted re.match("^[0-9]+$",
                            # value), but `re` is never imported in this script,
                            # so reaching this line raised NameError.  str.isdigit
                            # performs the equivalent check.
                            assert value.isdigit()
                            numeric_changes[i] = int(value)
                    elif k == 9:
                        # mirrored flag change, stored as a '0'/'1' character
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    else:
                        # unexpected field change: fail loudly with context
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, zip(bidir_changes, category_changes,
                                     decimal_changes, mirrored_changes,
                                     numeric_changes),
                        normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000689
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000690
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000691# --------------------------------------------------------------------
692# the following support code is taken from the unidb utilities
693# Copyright (c) 1999-2000 by Secret Labs AB
694
695# load a unicode-data file from disk
696
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000697import sys
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000698
class UnicodeData:
    """In-memory view of one UCD snapshot.

    Public attributes set by __init__:
      filename   -- the UnicodeData file that was read
      table      -- list of 0x110000 entries; each is the semicolon-split
                    field list for that code point (with the east-asian
                    width appended as the last field), or None if unassigned
      chars      -- the code point range to iterate (0..0x10FFFF)
      exclusions -- dict keyed by code points listed in the composition
                    exclusions file
      changed    -- list filled in later by merge_old_version
    """

    def __init__(self, filename, exclusions, eastasianwidth, expand=1):
        self.changed = []
        file = open(filename)
        table = [None] * 0x110000
        while 1:
            s = file.readline()
            if not s:
                break
            # fields are semicolon-separated; field 0 is the hex code point
            s = s.strip().split(";")
            char = int(s[0], 16)
            table[char] = s

        # expand first-last ranges: "<..., First>"/"<..., Last>" record
        # pairs stand for every code point in between; give each one a
        # copy of the First record with its own code point and empty name
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                    elif field:
                        f2 = field[:]
                        f2[0] = "%X" % i
                        table[i] = f2

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(0x110000) # unicode 3.2

        # composition exclusions: one hex code point per non-comment line
        file = open(exclusions)
        self.exclusions = {}
        for s in file:
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            char = int(s.split()[0],16)
            self.exclusions[char] = 1

        # east asian width: "codepoint;width" or "first..last;width" lines;
        # the width letter is appended to each assigned table record
        widths = [None] * 0x110000
        for s in open(eastasianwidth):
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            s = s.split()[0].split(';')
            if '..' in s[0]:
                first, last = [int(c, 16) for c in s[0].split('..')]
                chars = range(first, last+1)
            else:
                chars = [int(s[0], 16)]
            for char in chars:
                widths[char] = s[1]
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = range(256)
768
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000769# hash table tools
770
771# this is a straight-forward reimplementation of Python's built-in
772# dictionary type, using a static data structure, and a custom string
773# hash algorithm.
774
def myhash(s, magic):
    """Hash string s, case-insensitively, into the 24-bit range.

    magic is the multiplier.  After every character, any bits that
    overflowed above bit 24 are folded back into the low bits so the
    result stays within 24 bits.

    Fix vs. original: the 0xff000000L long literal is Python-2-only
    syntax (SyntaxError on 3.x); the plain literal has the same value.
    """
    h = 0
    for c in map(ord, s.upper()):
        h = (h * magic) + c
        ix = h & 0xff000000
        if ix:
            # fold the overflow byte back in, then mask to 24 bits
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
    return h
783
# (table size, poly) candidate pairs used by Hash below: its constructor
# scans this list in order, picks the first power-of-two size with more
# slots than entries, and folds size+poly into the probe-increment mixing
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
790
791class Hash:
792 def __init__(self, name, data, magic):
793 # turn a (key, value) list into a static hash table structure
794
795 # determine table size
796 for size, poly in SIZES:
797 if size > len(data):
798 poly = size + poly
799 break
800 else:
801 raise AssertionError, "ran out of polynominals"
802
803 print size, "slots in hash table"
804
805 table = [None] * size
806
807 mask = size-1
808
809 n = 0
810
811 hash = myhash
812
813 # initialize hash table
814 for key, value in data:
815 h = hash(key, magic)
816 i = (~h) & mask
817 v = table[i]
818 if v is None:
819 table[i] = value
820 continue
821 incr = (h ^ (h >> 3)) & mask;
822 if not incr:
823 incr = mask
824 while 1:
825 n = n + 1
826 i = (i + incr) & mask
827 v = table[i]
828 if v is None:
829 table[i] = value
830 break
831 incr = incr << 1
832 if incr > mask:
833 incr = incr ^ poly
834
835 print n, "collisions"
836 self.collisions = n
837
838 for i in range(len(table)):
839 if table[i] is None:
840 table[i] = 0
841
842 self.data = Array(name + "_hash", table)
843 self.magic = magic
844 self.name = name
845 self.size = size
846 self.poly = poly
847
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000848 def dump(self, file, trace):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000849 # write data to file, as a C array
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000850 self.data.dump(file, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000851 file.write("#define %s_magic %d\n" % (self.name, self.magic))
852 file.write("#define %s_size %d\n" % (self.name, self.size))
853 file.write("#define %s_poly %d\n" % (self.name, self.poly))
854
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000855# stuff to deal with arrays of unsigned integers
856
class Array:
    """A named array of unsigned integers, dumpable as C source.

    Fix vs. original: the py2-only `print >>sys.stderr, ...` chevron
    syntax in dump() is replaced by an equivalent sys.stderr.write call
    with byte-identical output, valid on both Python 2 and 3.
    """

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        """Write the array to file as a static C array declaration.

        The element type (unsigned char/short/int) is the smallest that
        holds every value.  If trace is true, report the total byte size
        to stderr.
        """
        size = getsize(self.data)
        if trace:
            sys.stderr.write("%s: %d bytes\n" %
                             (self.name, size*len(self.data)))
        file.write("static ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = "    "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    # wrap output lines at 78 columns
                    file.write(s + "\n")
                    s = "    " + i
                else:
                    s = s + i
            if s.strip():
                file.write(s + "\n")
        file.write("};\n\n")
888
def getsize(data):
    """Return the smallest C integer size (in bytes) that fits max(data)."""
    largest = max(data)
    if largest >= 65536:
        return 4
    if largest >= 256:
        return 2
    return 1
898
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.

    Fixes vs. original: py2-only constructs replaced by behaviorally
    identical 2/3-compatible ones (`print >>sys.stderr` -> stderr.write,
    `sys.maxint` sentinel -> None, `xrange` -> range); the redundant
    function-local `import sys` dropped (the module imports sys at the
    top); the local named `bytes` no longer shadows the builtin.
    """
    if trace:
        def dump(t1, t2, shift, nbytes):
            # progress line: bin counts, shift, and total byte size
            sys.stderr.write("%d+%d bins at shift %d; %d bytes\n" % (
                len(t1), len(t2), shift, nbytes))
        sys.stderr.write("Size of original table: %d bytes\n" %
                         (len(t)*getsize(t)))
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    smallest = None     # smallest total size so far (None = none yet)
    t = tuple(t)        # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        # cut t into bins of 2**shift entries; identical bins are stored
        # in t2 only once, and t1 records where each bin starts
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size of this decomposition
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if smallest is None or b < smallest:
            best = t1, t2, shift
            smallest = b
    t1, t2, shift = best
    if trace:
        # "Best: " prefix then the dump line, matching the original's
        # softspace-joined two-print output
        sys.stderr.write("Best: ")
        dump(t1, t2, shift, smallest)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000960
if __name__ == "__main__":
    # regenerate the database tables with tracing enabled (trace=1);
    # maketables is defined earlier in the full script, above this chunk
    maketables(1)