Blame - Tools/unicode/makeunicodedata.py - platform/external/python/cpython2

2000-09-24 23:18:31 +0000

[diff] [blame]

1

#

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

2

# (re)generate unicode property and type databases

3

#

Martin v. Löwis

2002-11-25 09:13:37 +0000

[diff] [blame]

4

# this script converts the unicode database files (see UNIDATA_VERSION) to

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

5

# Modules/unicodedata_db.h, Modules/unicodename_db.h,

6

# and Objects/unicodetype_db.h

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

7

#

8

# history:

9

# 2000-09-24 fl created (based on bits and pieces from unidb)

10

# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

11

# 2000-09-25 fl added character type table

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

12

# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

13

# 2000-11-03 fl expand first/last ranges

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

14

# 2001-01-19 fl added character name tables (2.1)

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

15

# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold

Martin v. Löwis

2002-11-23 22:08:15 +0000

[diff] [blame]

16

# 2002-09-11 wd use string methods

17

# 2002-10-18 mvl update to Unicode 3.2

18

# 2002-10-22 mvl generate NFC tables

Martin v. Löwis

2002-11-24 23:05:09 +0000

[diff] [blame]

19

# 2002-11-24 mvl expand all ranges, sort names version-independently

Martin v. Löwis

2002-11-25 09:13:37 +0000

[diff] [blame]

20

# 2002-11-25 mvl add UNIDATA_VERSION

Hye-Shik Chang

2004-06-02 16:49:17 +0000

[diff] [blame]

21

# 2004-05-29 perky add east asian width information

Martin v. Löwis

43179c8

2006-03-11 12:43:44 +0000

[diff] [blame]

22

# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

23

#

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

24

# written by Fredrik Lundh (fredrik@pythonware.com)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

#

import sys

SCRIPT = sys.argv[0]
VERSION = "2.6"

# The Unicode Database
UNIDATA_VERSION = "5.2.0"

# Input file-name templates.  maketables() fills the "%s" slot with "" for
# the current database and with "-<version>" for each entry of old_versions.
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"

old_versions = ["3.2.0"]

# The position of a name in the three lists below is the number stored in
# the generated records (see the .index() calls in makeunicodedata), and the
# lists themselves are written out as C string tables, so order matters.
# NOTE(review): "Cn" appears twice (positions 0 and 17).  list.index()
# always returns the first one; do not drop the second without checking the
# C side, since that would shift every later position.
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
NODELTA_MASK = 0x100
NUMERIC_MASK = 0x200

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

67

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

68

def maketables(trace=0):

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

69

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

70

print "--- Reading", UNICODE_DATA % "", "..."

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

71

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

72

version = ""

73

unicode = UnicodeData(UNICODE_DATA % version,

74

COMPOSITION_EXCLUSIONS % version,

Antoine Pitrou

2009-04-27 21:53:26 +0000

[diff] [blame]

75

EASTASIAN_WIDTH % version,

Amaury Forgeot d'Arc

2009-10-06 19:56:32 +0000

[diff] [blame]

76

UNIHAN % version,

Florent Xicluna

2010-03-30 08:24:06 +0000

[diff] [blame]

77

DERIVEDNORMALIZATION_PROPS % version,

78

LINE_BREAK % version)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

79

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

80

print len(filter(None, unicode.table)), "characters"

81

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

82

for version in old_versions:

83

print "--- Reading", UNICODE_DATA % ("-"+version), "..."

84

old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),

85

COMPOSITION_EXCLUSIONS % ("-"+version),

Amaury Forgeot d'Arc

2009-10-06 19:56:32 +0000

[diff] [blame]

86

EASTASIAN_WIDTH % ("-"+version),

87

UNIHAN % ("-"+version))

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

88

print len(filter(None, old_unicode.table)), "characters"

89

merge_old_version(version, unicode, old_unicode)

90

Fredrik Lundh

b2dfd73

2001-01-21 23:31:52 +0000

[diff] [blame]

91

makeunicodename(unicode, trace)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

92

makeunicodedata(unicode, trace)

Fredrik Lundh

b2dfd73

2001-01-21 23:31:52 +0000

[diff] [blame]

93

makeunicodetype(unicode, trace)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

94

95

# --------------------------------------------------------------------

96

# unicode character properties

97

98

def makeunicodedata(unicode, trace):

99

Antoine Pitrou

2009-04-27 21:53:26 +0000

[diff] [blame]

100

dummy = (0, 0, 0, 0, 0, 0)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

101

table = [dummy]

102

cache = {0: dummy}

103

index = [0] * len(unicode.chars)

104

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

105

FILE = "Modules/unicodedata_db.h"

106

107

print "--- Preparing", FILE, "..."

108

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

109

# 1) database properties

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

110

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

111

for char in unicode.chars:

112

record = unicode.table[char]

113

if record:

114

# extract database properties

115

category = CATEGORY_NAMES.index(record[2])

116

combining = int(record[3])

117

bidirectional = BIDIRECTIONAL_NAMES.index(record[4])

118

mirrored = record[9] == "Y"

Hye-Shik Chang

e9ddfbb

2004-08-04 07:38:35 +0000

[diff] [blame]

119

eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])

Florent Xicluna

2010-03-30 08:24:06 +0000

[diff] [blame]

120

normalizationquickcheck = record[17]

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

121

item = (

Antoine Pitrou

2009-04-27 21:53:26 +0000

[diff] [blame]

122

category, combining, bidirectional, mirrored, eastasianwidth,

123

normalizationquickcheck

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

124

)

125

# add entry to index and item tables

126

i = cache.get(item)

127

if i is None:

128

cache[item] = i = len(table)

table.append(item)

index[char] = i

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

132

# 2) decomposition data

133

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

134

decomp_data = [0]

135

decomp_prefix = [""]

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

136

decomp_index = [0] * len(unicode.chars)

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

137

decomp_size = 0

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

138

Martin v. Löwis

2002-11-23 22:08:15 +0000

[diff] [blame]

139

comp_pairs = []

140

comp_first = [None] * len(unicode.chars)

141

comp_last = [None] * len(unicode.chars)

142

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

143

for char in unicode.chars:

144

record = unicode.table[char]

145

if record:

146

if record[5]:

Walter Dörwald

2002-09-11 20:36:02 +0000

[diff] [blame]

147

decomp = record[5].split()

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

148

if len(decomp) > 19:

149

raise Exception, "character %x has a decomposition too large for nfd_nfkd" % char

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

150

# prefix

151

if decomp[0][0] == "<":

152

prefix = decomp.pop(0)

153

else:

154

prefix = ""

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

155

try:

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

156

i = decomp_prefix.index(prefix)

157

except ValueError:

158

i = len(decomp_prefix)

159

decomp_prefix.append(prefix)

160

prefix = i

161

assert prefix < 256

162

# content

Florent Xicluna

2010-03-15 14:00:58 +0000

[diff] [blame]

163

decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]

Martin v. Löwis

2002-11-23 22:08:15 +0000

[diff] [blame]

164

# Collect NFC pairs

165

if not prefix and len(decomp) == 3 and \

166

char not in unicode.exclusions and \

167

unicode.table[decomp[1]][3] == "0":

p, l, r = decomp

comp_first[l] = 1

comp_last[r] = 1

comp_pairs.append((l,r,char))

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

172

try:

173

i = decomp_data.index(decomp)

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

174

except ValueError:

175

i = len(decomp_data)

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

176

decomp_data.extend(decomp)

177

decomp_size = decomp_size + len(decomp) * 2

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

178

else:

179

i = 0

180

decomp_index[char] = i

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

181

Martin v. Löwis

2002-11-23 22:08:15 +0000

[diff] [blame]

182

f = l = 0

183

comp_first_ranges = []

184

comp_last_ranges = []

185

prev_f = prev_l = None

186

for i in unicode.chars:

187

if comp_first[i] is not None:

comp_first[i] = f

f += 1

if prev_f is None:

prev_f = (i,i)

elif prev_f[1]+1 == i:

193

prev_f = prev_f[0],i

194

else:

195

comp_first_ranges.append(prev_f)

196

prev_f = (i,i)

197

if comp_last[i] is not None:

comp_last[i] = l

l += 1

if prev_l is None:

prev_l = (i,i)

elif prev_l[1]+1 == i:

203

prev_l = prev_l[0],i

204

else:

205

comp_last_ranges.append(prev_l)

206

prev_l = (i,i)

207

comp_first_ranges.append(prev_f)

208

comp_last_ranges.append(prev_l)

total_first = f

total_last = l

comp_data = [0]*(total_first*total_last)

213

for f,l,char in comp_pairs:

214

f = comp_first[f]

215

l = comp_last[l]

216

comp_data[f*total_last+l] = char

217

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

218

print len(table), "unique properties"

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

219

print len(decomp_prefix), "unique decomposition prefixes"

220

print len(decomp_data), "unique decomposition entries:",

221

print decomp_size, "bytes"

Martin v. Löwis

2002-11-23 22:08:15 +0000

[diff] [blame]

222

print total_first, "first characters in NFC"

223

print total_last, "last characters in NFC"

224

print len(comp_pairs), "NFC pairs"

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

225

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

226

print "--- Writing", FILE, "..."

227

Fred Drake

2000-10-26 03:56:46 +0000

[diff] [blame]

228

fp = open(FILE, "w")

229

print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)

230

print >>fp

Martin v. Löwis

2002-11-25 09:13:37 +0000

[diff] [blame]

231

print >>fp, '#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION

Fred Drake

2000-10-26 03:56:46 +0000

[diff] [blame]

232

print >>fp, "/* a list of unique database records */"

233

print >>fp, \

234

"const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

235

for item in table:

Antoine Pitrou

2009-04-27 21:53:26 +0000

[diff] [blame]

236

print >>fp, " {%d, %d, %d, %d, %d, %d}," % item

Fred Drake

2000-10-26 03:56:46 +0000

[diff] [blame]

237

print >>fp, "};"

238

print >>fp

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

239

Martin v. Löwis

2002-11-23 22:08:15 +0000

[diff] [blame]

240

print >>fp, "/* Reindexing of NFC first characters. */"

241

print >>fp, "#define TOTAL_FIRST",total_first

242

print >>fp, "#define TOTAL_LAST",total_last

243

print >>fp, "struct reindex{int start;short count,index;};"

Martin v. Löwis

111c180

2008-06-13 07:47:47 +0000

[diff] [blame]

244

print >>fp, "static struct reindex nfc_first[] = {"

Martin v. Löwis

2002-11-23 22:08:15 +0000

[diff] [blame]

245

for start,end in comp_first_ranges:

246

print >>fp," { %d, %d, %d}," % (start,end-start,comp_first[start])

247

print >>fp," {0,0,0}"

248

print >>fp,"};\n"

Martin v. Löwis

111c180

2008-06-13 07:47:47 +0000

[diff] [blame]

249

print >>fp, "static struct reindex nfc_last[] = {"

Martin v. Löwis

2002-11-23 22:08:15 +0000

[diff] [blame]

250

for start,end in comp_last_ranges:

251

print >>fp," { %d, %d, %d}," % (start,end-start,comp_last[start])

252

print >>fp," {0,0,0}"

253

print >>fp,"};\n"

254

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

255

# FIXME: <fl> the following tables could be made static, and

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

256

# the support code moved into unicodedatabase.c

257

Fred Drake

2000-10-26 03:56:46 +0000

[diff] [blame]

258

print >>fp, "/* string literals */"

259

print >>fp, "const char *_PyUnicode_CategoryNames[] = {"

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

260

for name in CATEGORY_NAMES:

Fred Drake

2000-10-26 03:56:46 +0000

[diff] [blame]

261

print >>fp, " \"%s\"," % name

262

print >>fp, " NULL"

263

print >>fp, "};"

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

264

Fred Drake

2000-10-26 03:56:46 +0000

[diff] [blame]

265

print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

266

for name in BIDIRECTIONAL_NAMES:

Fred Drake

2000-10-26 03:56:46 +0000

[diff] [blame]

267

print >>fp, " \"%s\"," % name

268

print >>fp, " NULL"

269

print >>fp, "};"

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

270

Hye-Shik Chang

e9ddfbb

2004-08-04 07:38:35 +0000

[diff] [blame]

271

print >>fp, "const char *_PyUnicode_EastAsianWidthNames[] = {"

272

for name in EASTASIANWIDTH_NAMES:

273

print >>fp, " \"%s\"," % name

print >>fp, " NULL"

print >>fp, "};"

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

277

print >>fp, "static const char *decomp_prefix[] = {"

278

for name in decomp_prefix:

Fred Drake

2000-10-26 03:56:46 +0000

[diff] [blame]

279

print >>fp, " \"%s\"," % name

280

print >>fp, " NULL"

281

print >>fp, "};"

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

282

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

283

# split record index table

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

284

index1, index2, shift = splitbins(index, trace)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

285

Fred Drake

2000-10-26 03:56:46 +0000

[diff] [blame]

286

print >>fp, "/* index tables for the database records */"

287

print >>fp, "#define SHIFT", shift

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

288

Array("index1", index1).dump(fp, trace)

289

Array("index2", index2).dump(fp, trace)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

290

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

291

# split decomposition index table

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

292

index1, index2, shift = splitbins(decomp_index, trace)

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

293

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

294

print >>fp, "/* decomposition data */"

295

Array("decomp_data", decomp_data).dump(fp, trace)

296

Fred Drake

2000-10-26 03:56:46 +0000

[diff] [blame]

297

print >>fp, "/* index tables for the decomposition data */"

298

print >>fp, "#define DECOMP_SHIFT", shift

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

299

Array("decomp_index1", index1).dump(fp, trace)

300

Array("decomp_index2", index2).dump(fp, trace)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

301

Martin v. Löwis

2002-11-23 22:08:15 +0000

[diff] [blame]

302

index, index2, shift = splitbins(comp_data, trace)

303

print >>fp, "/* NFC pairs */"

304

print >>fp, "#define COMP_SHIFT", shift

305

Array("comp_index", index).dump(fp, trace)

306

Array("comp_data", index2).dump(fp, trace)

307

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

308

# Generate delta tables for old versions

309

for version, table, normalization in unicode.changed:

310

cversion = version.replace(".","_")

311

records = [table[0]]

312

cache = {table[0]:0}

313

index = [0] * len(table)

314

for i, record in enumerate(table):

315

try:

316

index[i] = cache[record]

317

except KeyError:

318

index[i] = cache[record] = len(records)

319

records.append(record)

320

index1, index2, shift = splitbins(index, trace)

321

print >>fp, "static const change_record change_records_%s[] = {" % cversion

322

for record in records:

323

print >>fp, "\t{ %s }," % ", ".join(map(str,record))

324

print >>fp, "};"

325

Array("changes_%s_index" % cversion, index1).dump(fp, trace)

326

Array("changes_%s_data" % cversion, index2).dump(fp, trace)

327

print >>fp, "static const change_record* get_change_%s(Py_UCS4 n)" % cversion

328

print >>fp, "{"

329

print >>fp, "\tint index;"

330

print >>fp, "\tif (n >= 0x110000) index = 0;"

331

print >>fp, "\telse {"

332

print >>fp, "\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift)

333

print >>fp, "\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \

334

(cversion, shift, ((1<<shift)-1))

335

print >>fp, "\t}"

336

print >>fp, "\treturn change_records_%s+index;" % cversion

337

print >>fp, "}\n"

338

print >>fp, "static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion

339

print >>fp, "{"

340

print >>fp, "\tswitch(n) {"

341

for k, v in normalization:

342

print >>fp, "\tcase %s: return 0x%s;" % (hex(k), v)

343

print >>fp, "\tdefault: return 0;"

344

print >>fp, "\t}\n}\n"

345

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

346

fp.close()

347

348

# --------------------------------------------------------------------

349

# unicode character type tables

350

351

def makeunicodetype(unicode, trace):

352

353

FILE = "Objects/unicodetype_db.h"

354

355

print "--- Preparing", FILE, "..."

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

356

357

# extract unicode types

Fredrik Lundh

2000-09-25 21:01:56 +0000

[diff] [blame]

358

dummy = (0, 0, 0, 0, 0, 0)

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

359

table = [dummy]

360

cache = {0: dummy}

361

index = [0] * len(unicode.chars)

Amaury Forgeot d'Arc

2009-10-06 19:56:32 +0000

[diff] [blame]

362

numeric = {}

363

spaces = []

364

linebreaks = []

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

365

366

for char in unicode.chars:

367

record = unicode.table[char]

368

if record:

369

# extract database properties

370

category = record[2]

371

bidirectional = record[4]

Florent Xicluna

2010-03-30 08:24:06 +0000

[diff] [blame]

372

properties = record[16]

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

373

flags = 0

Martin v. Löwis

2008-09-10 13:38:12 +0000

[diff] [blame]

374

delta = True

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

375

if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:

376

flags |= ALPHA_MASK

377

if category == "Ll":

378

flags |= LOWER_MASK

Florent Xicluna

2010-03-30 08:24:06 +0000

[diff] [blame]

379

if 'Line_Break' in properties or bidirectional == "B":

Fredrik Lundh

2000-09-25 21:01:56 +0000

[diff] [blame]

380

flags |= LINEBREAK_MASK

Amaury Forgeot d'Arc

2009-10-06 19:56:32 +0000

[diff] [blame]

381

linebreaks.append(char)

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

382

if category == "Zs" or bidirectional in ("WS", "B", "S"):

383

flags |= SPACE_MASK

Amaury Forgeot d'Arc

2009-10-06 19:56:32 +0000

[diff] [blame]

384

spaces.append(char)

Fredrik Lundh

375732c

2000-09-25 23:03:34 +0000

[diff] [blame]

385

if category == "Lt":

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

386

flags |= TITLE_MASK

387

if category == "Lu":

388

flags |= UPPER_MASK

Martin v. Löwis

2008-09-10 13:38:12 +0000

[diff] [blame]

389

# use delta predictor for upper/lower/title if it fits

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

390

if record[12]:

Walter Dörwald

2009-04-25 14:03:16 +0000

[diff] [blame]

391

upper = int(record[12], 16)

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

392

else:

Walter Dörwald

2009-04-25 14:03:16 +0000

[diff] [blame]

393

upper = char

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

394

if record[13]:

Walter Dörwald

2009-04-25 14:03:16 +0000

[diff] [blame]

395

lower = int(record[13], 16)

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

396

else:

Walter Dörwald

2009-04-25 14:03:16 +0000

[diff] [blame]

397

lower = char

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

398

if record[14]:

Walter Dörwald

2009-04-25 14:03:16 +0000

[diff] [blame]

399

title = int(record[14], 16)

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

400

else:

Walter Dörwald

2009-04-25 14:03:16 +0000

[diff] [blame]

401

# UCD.html says that a missing title char means that

402

# it defaults to the uppercase character, not to the

403

# character itself. Apparently, in the current UCD (5.x)

404

# this feature is never used

405

title = upper

406

upper_d = upper - char

407

lower_d = lower - char

408

title_d = title - char

409

if -32768 <= upper_d <= 32767 and \

410

-32768 <= lower_d <= 32767 and \

411

-32768 <= title_d <= 32767:

412

# use deltas

413

upper = upper_d & 0xffff

414

lower = lower_d & 0xffff

415

title = title_d & 0xffff

416

else:

Martin v. Löwis

2008-09-10 13:38:12 +0000

[diff] [blame]

417

flags |= NODELTA_MASK

Fredrik Lundh

2000-09-25 21:01:56 +0000

[diff] [blame]

418

# decimal digit, integer digit

419

decimal = 0

420

if record[6]:

421

flags |= DECIMAL_MASK

422

decimal = int(record[6])

digit = 0

if record[7]:

flags |= DIGIT_MASK

digit = int(record[7])

Amaury Forgeot d'Arc

2009-10-06 19:56:32 +0000

[diff] [blame]

427

if record[8]:

428

flags |= NUMERIC_MASK

429

numeric.setdefault(record[8], []).append(char)

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

430

item = (

Hye-Shik Chang

2004-06-02 16:49:17 +0000

[diff] [blame]

431

upper, lower, title, decimal, digit, flags

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

432

)

433

# add entry to index and item tables

434

i = cache.get(item)

435

if i is None:

436

cache[item] = i = len(table)

table.append(item)

index[char] = i

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

440

print len(table), "unique character type entries"

Amaury Forgeot d'Arc

2009-10-06 19:56:32 +0000

[diff] [blame]

441

print sum(map(len, numeric.values())), "numeric code points"

442

print len(spaces), "whitespace code points"

443

print len(linebreaks), "linebreak code points"

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

444

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

445

print "--- Writing", FILE, "..."

446

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

447

fp = open(FILE, "w")

Fred Drake

2000-10-26 03:56:46 +0000

[diff] [blame]

448

print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)

449

print >>fp

450

print >>fp, "/* a list of unique character type descriptors */"

451

print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

452

for item in table:

Fred Drake

2000-10-26 03:56:46 +0000

[diff] [blame]

453

print >>fp, " {%d, %d, %d, %d, %d, %d}," % item

454

print >>fp, "};"

455

print >>fp

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

456

457

# split decomposition index table

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

458

index1, index2, shift = splitbins(index, trace)

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

459

Fred Drake

2000-10-26 03:56:46 +0000

[diff] [blame]

460

print >>fp, "/* type indexes */"

461

print >>fp, "#define SHIFT", shift

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

462

Array("index1", index1).dump(fp, trace)

463

Array("index2", index2).dump(fp, trace)

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

464

Amaury Forgeot d'Arc

2009-10-06 19:56:32 +0000

[diff] [blame]

465

# Generate code for _PyUnicode_ToNumeric()

Florent Xicluna

2010-03-15 14:00:58 +0000

[diff] [blame]

466

numeric_items = sorted(numeric.items())

Amaury Forgeot d'Arc

2009-10-06 19:56:32 +0000

[diff] [blame]

467

print >>fp, '/* Returns the numeric value as double for Unicode characters'

468

print >>fp, ' * having this property, -1.0 otherwise.'

469

print >>fp, ' */'

470

print >>fp, 'double _PyUnicode_ToNumeric(Py_UNICODE ch)'

471

print >>fp, '{'

472

print >>fp, ' switch (ch) {'

473

for value, codepoints in numeric_items:

Amaury Forgeot d'Arc

5c92d43

2009-10-13 21:29:34 +0000

[diff] [blame]

474

# Turn text into float literals

475

parts = value.split('/')

476

parts = [repr(float(part)) for part in parts]

477

value = '/'.join(parts)

478

Amaury Forgeot d'Arc

2009-10-06 19:56:32 +0000

[diff] [blame]

haswide = False

hasnonewide = False

codepoints.sort()

for codepoint in codepoints:

483

if codepoint < 0x10000:

484

hasnonewide = True

485

if codepoint >= 0x10000 and not haswide:

486

print >>fp, '#ifdef Py_UNICODE_WIDE'

487

haswide = True

488

print >>fp, ' case 0x%04X:' % (codepoint,)

489

if haswide and hasnonewide:

490

print >>fp, '#endif'

491

print >>fp, ' return (double) %s;' % (value,)

492

if haswide and not hasnonewide:

493

print >>fp, '#endif'

494

print >>fp,' }'

495

print >>fp,' return -1.0;'

print >>fp,'}'

print >>fp

# Generate code for _PyUnicode_IsWhitespace()

500

print >>fp, "/* Returns 1 for Unicode characters having the bidirectional"

501

print >>fp, " * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise."

502

print >>fp, " */"

503

print >>fp, 'int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)'

504

print >>fp, '{'

505

print >>fp, '#ifdef WANT_WCTYPE_FUNCTIONS'

506

print >>fp, ' return iswspace(ch);'

507

print >>fp, '#else'

508

print >>fp, ' switch (ch) {'

509

510

haswide = False

511

hasnonewide = False

Florent Xicluna

2010-03-15 14:00:58 +0000

[diff] [blame]

512

for codepoint in sorted(spaces):

Amaury Forgeot d'Arc

2009-10-06 19:56:32 +0000

[diff] [blame]

513

if codepoint < 0x10000:

514

hasnonewide = True

515

if codepoint >= 0x10000 and not haswide:

516

print >>fp, '#ifdef Py_UNICODE_WIDE'

517

haswide = True

518

print >>fp, ' case 0x%04X:' % (codepoint,)

519

if haswide and hasnonewide:

520

print >>fp, '#endif'

521

print >>fp, ' return 1;'

522

if haswide and not hasnonewide:

print >>fp, '#endif'

print >>fp,' }'

print >>fp,' return 0;'

print >>fp, '#endif'

print >>fp,'}'

print >>fp

# Generate code for _PyUnicode_IsLinebreak()

Florent Xicluna

2010-03-30 08:24:06 +0000

[diff] [blame]

532

print >>fp, "/* Returns 1 for Unicode characters having the line break"

533

print >>fp, " * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional"

534

print >>fp, " * type 'B', 0 otherwise."

Amaury Forgeot d'Arc

2009-10-06 19:56:32 +0000

[diff] [blame]

535

print >>fp, " */"

536

print >>fp, 'int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)'

537

print >>fp, '{'

538

print >>fp, ' switch (ch) {'

539

haswide = False

540

hasnonewide = False

Florent Xicluna

2010-03-15 14:00:58 +0000

[diff] [blame]

541

for codepoint in sorted(linebreaks):

Amaury Forgeot d'Arc

2009-10-06 19:56:32 +0000

[diff] [blame]

542

if codepoint < 0x10000:

543

hasnonewide = True

544

if codepoint >= 0x10000 and not haswide:

545

print >>fp, '#ifdef Py_UNICODE_WIDE'

546

haswide = True

547

print >>fp, ' case 0x%04X:' % (codepoint,)

548

if haswide and hasnonewide:

549

print >>fp, '#endif'

550

print >>fp, ' return 1;'

551

if haswide and not hasnonewide:

print >>fp, '#endif'

print >>fp,' }'

print >>fp,' return 0;'

print >>fp,'}'

print >>fp

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

559

fp.close()

560

561

# --------------------------------------------------------------------

562

# unicode name database

563

564

def makeunicodename(unicode, trace):

565

566

FILE = "Modules/unicodename_db.h"

567

568

print "--- Preparing", FILE, "..."

569

570

# collect names

571

names = [None] * len(unicode.chars)

572

573

for char in unicode.chars:

574

record = unicode.table[char]

575

if record:

576

name = record[1].strip()

577

if name and name[0] != "<":

578

names[char] = name + chr(0)

579

580

print len(filter(lambda n: n is not None, names)), "distinct names"

581

582

# collect unique words from names (note that we differ between

583

# words inside a sentence, and words ending a sentence. the

584

# latter includes the trailing null byte.

words = {}

n = b = 0

for char in unicode.chars:

name = names[char]

if name:

w = name.split()

b = b + len(name)

n = n + len(w)

for w in w:

l = words.get(w)

if l:

l.append(None)

else:

words[w] = [len(words)]

600

601

print n, "words in text;", b, "bytes"

602

603

wordlist = words.items()

604

Martin v. Löwis

2002-11-24 23:05:09 +0000

[diff] [blame]

605

# sort on falling frequency, then by name

Florent Xicluna

2010-03-15 14:00:58 +0000

[diff] [blame]

606

def word_key(a):

607

aword, alist = a

608

return -len(alist), aword

609

wordlist.sort(key=word_key)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

610

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

611

# figure out how many phrasebook escapes we need

612

escapes = 0

613

while escapes * 256 < len(wordlist):

614

escapes = escapes + 1

615

print escapes, "escapes"

616

617

short = 256 - escapes

assert short > 0

print short, "short indexes in lexicon"

622

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

623

# statistics

624

n = 0

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

625

for i in range(short):

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

626

n = n + len(wordlist[i][1])

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

627

print n, "short indexes in phrasebook"

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

628

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

629

# pick the most commonly used words, and sort the rest on falling

630

# length (to maximize overlap)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

631

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

632

wordlist, wordtail = wordlist[:short], wordlist[short:]

Florent Xicluna

2010-03-15 14:00:58 +0000

[diff] [blame]

633

wordtail.sort(key=lambda a: a[0], reverse=True)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

634

wordlist.extend(wordtail)

635

636

# generate lexicon from words

lexicon_offset = [0]

lexicon = ""

words = {}

# build a lexicon string

643

offset = 0

644

for w, x in wordlist:

645

# encoding: bit 7 indicates last character in word (chr(128)

646

# indicates the last character in an entire string)

647

ww = w[:-1] + chr(ord(w[-1])+128)

648

# reuse string tails, when possible

Walter Dörwald

2002-09-11 20:36:02 +0000

[diff] [blame]

649

o = lexicon.find(ww)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

650

if o < 0:

651

o = offset

652

lexicon = lexicon + ww

653

offset = offset + len(w)

654

words[w] = len(lexicon_offset)

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

655

lexicon_offset.append(o)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

656

657

lexicon = map(ord, lexicon)

658

659

# generate phrasebook from names and lexicon

660

phrasebook = [0]

661

phrasebook_offset = [0] * len(unicode.chars)

662

for char in unicode.chars:

name = names[char]

if name:

w = name.split()

phrasebook_offset[char] = len(phrasebook)

667

for w in w:

668

i = words[w]

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

669

if i < short:

670

phrasebook.append(i)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

671

else:

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

672

# store as two bytes

673

phrasebook.append((i>>8) + short)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

674

phrasebook.append(i&255)

675

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

676

assert getsize(phrasebook) == 1

677

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

678

#

679

# unicode name hash table

# extract names

data = []

for char in unicode.chars:

684

record = unicode.table[char]

685

if record:

686

name = record[1].strip()

687

if name and name[0] != "<":

688

data.append((name, char))

689

690

# the magic number 47 was chosen to minimize the number of

691

# collisions on the current data set. if you like, change it

692

# and see what happens...

693

694

codehash = Hash("code", data, 47)

695

696

print "--- Writing", FILE, "..."

697

698

fp = open(FILE, "w")

699

print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)

700

print >>fp

701

print >>fp, "#define NAME_MAXLEN", 256

702

print >>fp

703

print >>fp, "/* lexicon */"

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

704

Array("lexicon", lexicon).dump(fp, trace)

705

Array("lexicon_offset", lexicon_offset).dump(fp, trace)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

706

707

# split decomposition index table

708

offset1, offset2, shift = splitbins(phrasebook_offset, trace)

709

710

print >>fp, "/* code->name phrasebook */"

711

print >>fp, "#define phrasebook_shift", shift

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

712

print >>fp, "#define phrasebook_short", short

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

713

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

714

Array("phrasebook", phrasebook).dump(fp, trace)

715

Array("phrasebook_offset1", offset1).dump(fp, trace)

716

Array("phrasebook_offset2", offset2).dump(fp, trace)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

717

718

print >>fp, "/* name->code dictionary */"

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

719

codehash.dump(fp, trace)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

fp.close()

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

723

724

def merge_old_version(version, new, old):
    """Record on `new` how the older database `old` differs from it.

    version -- label stored with the change record.
    new     -- current database; a (version, changes, normalization_changes)
               tuple is appended to new.changed.
    old     -- older database, compared record by record with `new`.

    `changes` holds one (bidir, category, decimal, mirrored, numeric)
    tuple per code point.  Raises NotImplementedError if the exclusion
    sets differ, and a local Difference exception if a field changed
    that this function does not know how to handle.
    """
    class Difference(Exception):
        pass

    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    no_change = 0xFF
    bidir_changes = [no_change] * 0x110000
    category_changes = [no_change] * 0x110000
    decimal_changes = [no_change] * 0x110000
    mirrored_changes = [no_change] * 0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []

    # fields whose changes are deliberately ignored: ISO comment (11),
    # simple uppercase/lowercase/titlecase mappings (12-14) and the
    # properties set (16, not handled yet)
    ignored_fields = (11, 12, 13, 14, 16)

    for i in range(0x110000):
        new_rec = new.table[i]
        old_rec = old.table[i]
        if new_rec is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old_rec is None
            continue
        if old_rec is None:
            # unassigned in the old version; category 0 is "unassigned"
            category_changes[i] = 0
            continue
        if old_rec == new_rec:
            continue
        # the records differ: note the old value of every changed field
        for k in range(len(old_rec)):
            value = old_rec[k]
            if value == new_rec[k]:
                continue
            if k == 2:
                category_changes[i] = CATEGORY_NAMES.index(value)
            elif k == 4:
                bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
            elif k == 5:
                # We assume that all normalization changes are in 1:1 mappings
                assert " " not in value
                normalization_changes.append((i, value))
            elif k == 6:
                # we only support changes where the old value is a single digit
                assert value in "0123456789"
                decimal_changes[i] = int(value)
            elif k == 8:
                # Since 0 encodes "no change", the old value is better not 0
                if value:
                    numeric_changes[i] = float(value)
                    assert numeric_changes[i] not in (0, -1)
                else:
                    numeric_changes[i] = -1
            elif k == 9:
                mirrored_changes[i] = '1' if value == 'Y' else '0'
            elif k in ignored_fields:
                pass
            else:
                raise Difference(hex(i), k, old_rec, new_rec)

    # one tuple per code point, materialised as a list
    per_char = list(zip(bidir_changes, category_changes, decimal_changes,
                        mirrored_changes, numeric_changes))
    new.changed.append((version, per_char, normalization_changes))

Tim Peters

88ca467

2006-03-10 23:39:56 +0000

[diff] [blame]

806

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

807

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

808

# --------------------------------------------------------------------

809

# the following support code is taken from the unidb utilities

810

811

812

# load a unicode-data file from disk

813

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

814

class UnicodeData:
    """Character table loaded from the Unicode data files.

    Public attributes set by the constructor:
      filename   -- path of the main data file
      table      -- list with one slot per code point (0..0x10FFFF); each
                    slot is None or that character's record (see below)
      chars      -- the code points to process
      exclusions -- dict mapping code points from the exclusions file to 1
      changed    -- initially empty list
    """
    # Record structure:
    # [ID, name, category, combining, bidi, decomp,  (6)
    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
    #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
    #  properties] (17)
    # When derivednormalizationprops is given, the packed quick-check
    # bits are appended as one more field.

    def __init__(self, filename, exclusions, eastasianwidth, unihan,
                 derivednormalizationprops=None, linebreakprops=None,
                 expand=1):
        """Load the database.

        filename, exclusions, eastasianwidth and unihan are paths of the
        files to read.  derivednormalizationprops and linebreakprops are
        optional paths; when omitted the corresponding data is not loaded.
        expand -- when true, First>/Last> range pairs in the main file
                  are expanded to one record per code point.
        """
        self.changed = []
        table = self._read_table(filename)

        # expand first-last ranges
        if expand:
            self._expand_ranges(table)

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(0x110000) # unicode 3.2

        self.exclusions = self._read_exclusions(exclusions)

        # field 15: east asian width (None where the file has no entry)
        widths = self._read_widths(eastasianwidth)
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

        # field 16: set of extra property names
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(set())
        if linebreakprops:
            self._mark_line_breaks(table, linebreakprops)

        if derivednormalizationprops:
            quickchecks = self._read_quickchecks(derivednormalizationprops)
            for i in range(0, 0x110000):
                if table[i] is not None:
                    table[i].append(quickchecks[i])

        self._patch_numeric(table, unihan)

    def _parse_range(self, spec):
        # "XXXX" -> (x, x); "XXXX..YYYY" -> (x, y); values are hexadecimal
        if '..' not in spec:
            first = last = int(spec, 16)
        else:
            first, last = [int(c, 16) for c in spec.split('..')]
        return first, last

    def _read_table(self, filename):
        # read the ';'-separated main data file into a per-code-point list
        table = [None] * 0x110000
        fp = open(filename)
        try:
            for line in fp:
                fields = line.strip().split(";")
                table[int(fields[0], 16)] = fields
        finally:
            fp.close()
        return table

    def _expand_ranges(self, table):
        # fill the gap between each "..., First>" / "..., Last>" pair with
        # copies of the First record; the range markers lose their name
        field = None
        for i in range(0, 0x110000):
            s = table[i]
            if s:
                if s[1][-6:] == "First>":
                    s[1] = ""
                    field = s
                elif s[1][-5:] == "Last>":
                    s[1] = ""
                    field = None
            elif field:
                f2 = field[:]
                f2[0] = "%X" % i
                table[i] = f2

    def _read_exclusions(self, exclusions):
        # first token of every non-blank, non-comment line is a code point
        result = {}
        fp = open(exclusions)
        try:
            for s in fp:
                s = s.strip()
                if not s or s[0] == '#':
                    continue
                result[int(s.split()[0], 16)] = 1
        finally:
            fp.close()
        return result

    def _read_widths(self, eastasianwidth):
        # lines look like "XXXX;W" or "XXXX..YYYY;W", optionally followed
        # by whitespace and a comment
        widths = [None] * 0x110000
        fp = open(eastasianwidth)
        try:
            for s in fp:
                s = s.strip()
                if not s or s[0] == '#':
                    continue
                s = s.split()[0].split(';')
                first, last = self._parse_range(s[0])
                for char in range(first, last+1):
                    widths[char] = s[1]
        finally:
            fp.close()
        return widths

    def _mark_line_breaks(self, table, linebreakprops):
        # add 'Line_Break' to the property set of every character whose
        # class is listed in MANDATORY_LINE_BREAKS
        fp = open(linebreakprops)
        try:
            for s in fp:
                s = s.partition('#')[0]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
                    continue
                first, last = self._parse_range(s[0])
                for char in range(first, last+1):
                    table[char][-1].add('Line_Break')
        finally:
            fp.close()

    def _read_quickchecks(self, derivednormalizationprops):
        # pack the four quick-check properties into two bits each:
        # 0 = Yes (default), 1 = Maybe, 2 = No
        quickchecks = [0] * 0x110000 # default is Yes
        qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
        fp = open(derivednormalizationprops)
        try:
            for s in fp:
                if '#' in s:
                    s = s[:s.index('#')]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in qc_order:
                    continue
                quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
                quickcheck_shift = qc_order.index(s[1])*2
                quickcheck <<= quickcheck_shift
                first, last = self._parse_range(s[0])
                for char in range(first, last+1):
                    # each property may be set only once per character
                    assert not (quickchecks[char]>>quickcheck_shift)&3
                    quickchecks[char] |= quickcheck
        finally:
            fp.close()
        return quickchecks

    def _patch_numeric(self, table, unihan):
        # overwrite the numeric field (8) with values from the numeric
        # tags in the unihan file, with ',' separators removed
        fp = open(unihan)
        try:
            for line in fp:
                if not line.startswith('U+'):
                    continue
                code, tag, value = line.split(None, 3)[:3]
                if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                               'kOtherNumeric'):
                    continue
                value = value.strip().replace(',', '')
                i = int(code[2:], 16)
                # Patch the numeric field
                if table[i] is not None:
                    table[i][8] = value
        finally:
            fp.close()

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = range(256)

942

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

943

# hash table tools

944

945

# this is a straight-forward reimplementation of Python's built-in

946

# dictionary type, using a static data structure, and a custom string

947

# hash algorithm.

948

949

def myhash(s, magic):
    """Return the case-insensitive string hash used for the static tables.

    s     -- string to hash; it is upper-cased first.
    magic -- multiplier applied to the running hash before each character
             is added.

    After every character, if any of bits 24-31 are set, that byte is
    xor-ed into the low bits and the result is truncated to 24 bits.
    """
    h = 0
    for ch in s.upper():
        h = (h * magic) + ord(ch)
        # no "L" suffix: plain int literals are valid on Python 2 and 3
        high = h & 0xff000000
        if high:
            h = (h ^ ((high >> 24) & 0xff)) & 0x00ffffff
    return h

SIZES = [

(4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),

960

(1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),

961

(65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),

962

(2097152,5), (4194304,3), (8388608,33), (16777216,27)

]

class Hash:

def __init__(self, name, data, magic):

967

# turn a (key, value) list into a static hash table structure

968

969

# determine table size

970

for size, poly in SIZES:

if size > len(data):

poly = size + poly

break

else:

raise AssertionError, "ran out of polynominals"

976

977

print size, "slots in hash table"

978

979

table = [None] * size

mask = size-1

n = 0

hash = myhash

# initialize hash table

988

for key, value in data:

h = hash(key, magic)

i = (~h) & mask

v = table[i]

if v is None:

table[i] = value

continue

incr = (h ^ (h >> 3)) & mask;

if not incr:

incr = mask

while 1:

n = n + 1

i = (i + incr) & mask

v = table[i]

if v is None:

table[i] = value

break

incr = incr << 1

if incr > mask:

incr = incr ^ poly

print n, "collisions"

1010

self.collisions = n

1011

1012

for i in range(len(table)):

if table[i] is None:

table[i] = 0

self.data = Array(name + "_hash", table)

self.magic = magic

self.name = name

self.size = size

self.poly = poly

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

1022

def dump(self, file, trace):

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

1023

# write data to file, as a C array

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

1024

self.data.dump(file, trace)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

1025

file.write("#define %s_magic %d\n" % (self.name, self.magic))

1026

file.write("#define %s_size %d\n" % (self.name, self.size))

1027

file.write("#define %s_poly %d\n" % (self.name, self.poly))

1028

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

1029

# stuff to deal with arrays of unsigned integers

class Array:
    """A named sequence of unsigned integers that can be written as a
    static C array."""

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        """Write the data to `file` as a C array definition.

        The element type is chosen from getsize(self.data).  With a true
        `trace`, the array name and its size in bytes go to sys.stderr.
        """
        width = getsize(self.data)
        if trace:
            print >>sys.stderr, self.name+":", width*len(self.data), "bytes"
        if width == 1:
            ctype = "unsigned char"
        elif width == 2:
            ctype = "unsigned short"
        else:
            ctype = "unsigned int"
        file.write("static " + ctype + " " + self.name + "[] = {\n")
        if self.data:
            # NOTE(review): indent literal reproduced exactly as it appears
            # in this copy of the file; confirm the intended width.
            indent = " "
            line = indent
            for item in self.data:
                cell = str(item) + ", "
                # start a new output line once this one would pass 78 columns
                if len(line) + len(cell) > 78:
                    file.write(line + "\n")
                    line = indent + cell
                else:
                    line = line + cell
            if line.strip():
                file.write(line + "\n")
        file.write("};\n\n")

def getsize(data):
    """Return the smallest integer size in bytes (1, 2 or 4) that can hold
    every value in `data`.

    An empty sequence needs no storage per item and is reported as 1
    rather than letting max() raise ValueError.
    """
    if not data:
        return 1
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4

Tim Peters

2000-09-25 07:13:41 +0000

[diff] [blame]

1073

def splitbins(t, trace=0):
    """Split table t into a two-level lookup; return (t1, t2, shift).

    t is a sequence of ints.  Splitting saves space when many of the ints
    repeat.  t1 and t2 are lists of ints and shift is an int such that,
    for each i in range(len(t)),

        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]

    where mask isolates the low `shift` bits of i.  Every shift from 0 up
    to the largest one that still leaves part of the last index is tried,
    and the first one giving the smallest combined size of t1 and t2 (as
    measured with getsize) is returned.

    If trace is non-zero, progress info is printed to sys.stderr; a value
    above 1 also reports every candidate shift.
    """

    if trace:
        def report(t1, t2, shift, nbytes):
            print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, nbytes)
        print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
                            "bytes"

    # the most we can shift the last valid index and still have
    # something left
    maxshift = 0
    last = len(t) - 1
    while last > 1:
        last >>= 1
        maxshift += 1

    smallest = sys.maxint  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        binsize = 1 << shift
        seen = {}   # bin contents -> offset of that bin in t2
        t1 = []
        t2 = []
        for start in range(0, len(t), binsize):
            chunk = t[start:start+binsize]
            try:
                where = seen[chunk]
            except KeyError:
                where = seen[chunk] = len(t2)
                t2.extend(chunk)
            t1.append(where >> shift)
        # determine memory size
        cost = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            report(t1, t2, shift, cost)
        if cost < smallest:
            best = t1, t2, shift
            smallest = cost

    t1, t2, shift = best
    if trace:
        print >>sys.stderr, "Best:",
        report(t1, t2, shift, smallest)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = (1 << shift) - 1     # low-bit mask of shift bits
        for i in xrange(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

1133

1134

if __name__ == "__main__":

Fredrik Lundh