Blame - Tools/unicode/makeunicodedata.py - platform/external/python/cpython3

2000-09-24 23:18:31 +0000

[diff] [blame]

1

#

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

2

# (re)generate unicode property and type databases

3

#

Martin v. Löwis

b5c980b

2002-11-25 09:13:37 +0000

[diff] [blame]

4

# this script converts Unicode database files (UNIDATA_VERSION, plus old_versions) to

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

5

# Modules/unicodedata_db.h, Modules/unicodename_db.h,

6

# and Objects/unicodetype_db.h

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

7

#

8

# history:

9

# 2000-09-24 fl created (based on bits and pieces from unidb)

10

# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

11

# 2000-09-25 fl added character type table

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

12

# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

13

# 2000-11-03 fl expand first/last ranges

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

14

# 2001-01-19 fl added character name tables (2.1)

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

15

# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold

Martin v. Löwis

2002-11-23 22:08:15 +0000

[diff] [blame]

16

# 2002-09-11 wd use string methods

17

# 2002-10-18 mvl update to Unicode 3.2

18

# 2002-10-22 mvl generate NFC tables

Martin v. Löwis

2002-11-24 23:05:09 +0000

[diff] [blame]

19

# 2002-11-24 mvl expand all ranges, sort names version-independently

Martin v. Löwis

b5c980b

2002-11-25 09:13:37 +0000

[diff] [blame]

20

# 2002-11-25 mvl add UNIDATA_VERSION

Hye-Shik Chang

2004-06-02 16:49:17 +0000

[diff] [blame]

21

# 2004-05-29 perky add east asian width information

Martin v. Löwis

43179c8

2006-03-11 12:43:44 +0000

[diff] [blame]

22

# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta

Georg Brandl

d52429f

2008-07-04 15:55:02 +0000

[diff] [blame]

23

# 2008-06-11 gb add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch

Fredrik Lundh

2000-09-25 08:07:06 +0000

[diff] [blame]

24

#

Fredrik Lundh

2001-01-21 22:41:08 +0000

[diff] [blame]

25

# written by Fredrik Lundh (fredrik@pythonware.com)

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

#

import sys

# Name and version of this generator; both are stamped into the generated
# C headers.
SCRIPT = sys.argv[0]
VERSION = "2.6"

# The Unicode Database
UNIDATA_VERSION = "5.1.0"

# Input file name templates.  The "%s" slot receives "" for the current
# database and "-<version>" for each entry of old_versions.
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"

old_versions = ["3.2.0"]

# Property value names.  The position of a name in its list is the number
# stored in the generated tables.  "Cn" appears twice in CATEGORY_NAMES;
# list.index() always resolves it to position 0.
CATEGORY_NAMES = [
    "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So",
]

BIDIRECTIONAL_NAMES = [
    "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON",
]

EASTASIANWIDTH_NAMES = ["F", "H", "W", "Na", "A", "N"]

# note: should match definitions in Objects/unicodectype.c
# (one bit per character-type flag)
ALPHA_MASK = 1 << 0
DECIMAL_MASK = 1 << 1
DIGIT_MASK = 1 << 2
LOWER_MASK = 1 << 3
LINEBREAK_MASK = 1 << 4
SPACE_MASK = 1 << 5
TITLE_MASK = 1 << 6
UPPER_MASK = 1 << 7
XID_START_MASK = 1 << 8
XID_CONTINUE_MASK = 1 << 9
PRINTABLE_MASK = 1 << 10
NODELTA_MASK = 1 << 11

Fredrik Lundh

2000-09-25 17:59:57 +0000

[diff] [blame]

67

Fredrik Lundh

2000-11-03 20:24:15 +0000

[diff] [blame]

68

def maketables(trace=0):
    """Load the Unicode databases and regenerate all three C headers.

    The current database is read first; every version listed in
    old_versions is then read and merged into it via merge_old_version()
    before the name, property and type tables are written.

    trace is passed through unchanged to the individual generators.
    """
    print("--- Reading", UNICODE_DATA % "", "...")

    # The current database uses an empty version suffix.
    version = ""
    current_files = (
        UNICODE_DATA % version,
        COMPOSITION_EXCLUSIONS % version,
        EASTASIAN_WIDTH % version,
        DERIVED_CORE_PROPERTIES % version,
        DERIVEDNORMALIZATION_PROPS % version,
    )
    unicode = UnicodeData(*current_files)

    print(len([record for record in unicode.table if record]), "characters")

    for version in old_versions:
        # Old databases live in files carrying a "-<version>" suffix.
        suffix = "-" + version
        print("--- Reading", UNICODE_DATA % suffix, "...")
        old_files = (
            UNICODE_DATA % suffix,
            COMPOSITION_EXCLUSIONS % suffix,
            EASTASIAN_WIDTH % suffix,
            DERIVED_CORE_PROPERTIES % suffix,
        )
        old_unicode = UnicodeData(*old_files)
        print(len([record for record in old_unicode.table if record]),
              "characters")
        merge_old_version(version, unicode, old_unicode)

    # Same generator order as before: names, then properties, then types.
    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

93

94

# --------------------------------------------------------------------

95

# unicode character properties

96

97

def _number_marked(marks, chars):
    """Number the marked code points and group them into contiguous runs.

    Every entry of marks that is not None is overwritten, in chars order,
    with its running ordinal (0, 1, 2, ...).

    Returns (count, ranges): the number of marked entries and a list of
    inclusive (start, end) tuples, one per run of consecutive marked code
    points.  ranges is empty when nothing is marked.
    """
    count = 0
    ranges = []
    current = None
    for i in chars:
        if marks[i] is None:
            continue
        marks[i] = count
        count += 1
        if current is None:
            current = (i, i)
        elif current[1] + 1 == i:
            # extends the run in progress
            current = (current[0], i)
        else:
            ranges.append(current)
            current = (i, i)
    # Only flush a run that actually exists; appending None here would
    # break the "for start, end in ..." loops that consume the ranges.
    if current is not None:
        ranges.append(current)
    return count, ranges


def makeunicodedata(unicode, trace):
    """Generate Modules/unicodedata_db.h from the loaded database.

    unicode supplies chars (the code points to visit), table (one record
    per code point, falsy when there is none), exclusions (code points
    excluded from NFC pairing) and changed (per-old-version delta data).
    trace is forwarded to splitbins() and Array.dump().

    Raises Exception when a character's decomposition has more than 19
    fields.
    """
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    # Maps record tuple -> position in table.  It must be keyed by the
    # record: the lookups below are cache.get(item), so a seed keyed by the
    # index would never match and an all-zero record would be stored twice.
    cache = {dummy: 0}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            normalizationquickcheck = record[17]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                if len(decomp) > 19:
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content: header word (prefix index in the low byte,
                # length above it) followed by the code points
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    _, first, last = decomp
                    comp_first[first] = 1
                    comp_last[last] = 1
                    comp_pairs.append((first, last, char))
                # NOTE(review): decomp is a list while decomp_data holds
                # plain ints, so this lookup always raises ValueError and
                # every decomposition is appended.  Kept as is so the
                # generated offsets do not change.
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    # Renumber the first/last characters of NFC pairs densely and record
    # the contiguous code point ranges they occupy.
    total_first, comp_first_ranges = _number_marked(comp_first, unicode.chars)
    total_last, comp_last_ranges = _number_marked(comp_last, unicode.chars)

    comp_data = [0]*(total_first*total_last)
    for first, last, char in comp_pairs:
        comp_data[comp_first[first]*total_last + comp_last[last]] = char

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    # The context manager closes the header even if a later step raises.
    with open(FILE, "w") as fp:
        print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
        print(file=fp)
        print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
        print("/* a list of unique database records */", file=fp)
        print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
        for item in table:
            print(" {%d, %d, %d, %d, %d, %d}," % item, file=fp)
        print("};", file=fp)
        print(file=fp)

        print("/* Reindexing of NFC first characters. */", file=fp)
        print("#define TOTAL_FIRST",total_first, file=fp)
        print("#define TOTAL_LAST",total_last, file=fp)
        print("struct reindex{int start;short count,index;};", file=fp)
        for declaration, ranges, numbering in (
                ("static struct reindex nfc_first[] = {",
                 comp_first_ranges, comp_first),
                ("static struct reindex nfc_last[] = {",
                 comp_last_ranges, comp_last)):
            print(declaration, file=fp)
            for start, end in ranges:
                print(" { %d, %d, %d}," % (start,end-start,numbering[start]), file=fp)
            print(" {0,0,0}", file=fp)
            print("};\n", file=fp)

        # FIXME: <fl> the following tables could be made static, and
        # the support code moved into unicodedatabase.c

        print("/* string literals */", file=fp)
        for declaration, names in (
                ("const char *_PyUnicode_CategoryNames[] = {",
                 CATEGORY_NAMES),
                ("const char *_PyUnicode_BidirectionalNames[] = {",
                 BIDIRECTIONAL_NAMES),
                ("const char *_PyUnicode_EastAsianWidthNames[] = {",
                 EASTASIANWIDTH_NAMES),
                ("static const char *decomp_prefix[] = {",
                 decomp_prefix)):
            print(declaration, file=fp)
            for name in names:
                print(" \"%s\"," % name, file=fp)
            print(" NULL", file=fp)
            print("};", file=fp)

        # split record index table
        index1, index2, shift = splitbins(index, trace)

        print("/* index tables for the database records */", file=fp)
        print("#define SHIFT", shift, file=fp)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # split decomposition index table
        index1, index2, shift = splitbins(decomp_index, trace)

        print("/* decomposition data */", file=fp)
        Array("decomp_data", decomp_data).dump(fp, trace)

        print("/* index tables for the decomposition data */", file=fp)
        print("#define DECOMP_SHIFT", shift, file=fp)
        Array("decomp_index1", index1).dump(fp, trace)
        Array("decomp_index2", index2).dump(fp, trace)

        # split NFC pair table
        comp_index, comp_values, shift = splitbins(comp_data, trace)
        print("/* NFC pairs */", file=fp)
        print("#define COMP_SHIFT", shift, file=fp)
        Array("comp_index", comp_index).dump(fp, trace)
        Array("comp_data", comp_values).dump(fp, trace)

        # Generate delta tables for old versions
        for version, changes, normalization in unicode.changed:
            cversion = version.replace(".","_")
            records = [changes[0]]
            seen = {changes[0]:0}
            change_index = [0] * len(changes)
            for i, record in enumerate(changes):
                try:
                    change_index[i] = seen[record]
                except KeyError:
                    change_index[i] = seen[record] = len(records)
                    records.append(record)
            index1, index2, shift = splitbins(change_index, trace)
            print("static const change_record change_records_%s[] = {" % cversion, file=fp)
            for record in records:
                print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
            print("};", file=fp)
            Array("changes_%s_index" % cversion, index1).dump(fp, trace)
            Array("changes_%s_data" % cversion, index2).dump(fp, trace)
            print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
            print("{", file=fp)
            print("\tint index;", file=fp)
            print("\tif (n >= 0x110000) index = 0;", file=fp)
            print("\telse {", file=fp)
            print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
            print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
                  (cversion, shift, ((1<<shift)-1)), file=fp)
            print("\t}", file=fp)
            print("\treturn change_records_%s+index;" % cversion, file=fp)
            print("}\n", file=fp)
            print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
            print("{", file=fp)
            print("\tswitch(n) {", file=fp)
            for k, v in normalization:
                print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
            print("\tdefault: return 0;", file=fp)
            print("\t}\n}\n", file=fp)

345

346

# --------------------------------------------------------------------

347

# unicode character type tables

348

349

def makeunicodetype(unicode, trace):
    """Generate Objects/unicodetype_db.h from the loaded database.

    For every code point in unicode.chars that has a record in
    unicode.table, a (upper, lower, title, decimal, digit, flags) tuple is
    computed.  Distinct tuples are stored once and referenced through a
    split index table.  trace is forwarded to splitbins() and
    Array.dump().
    """
    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    # Maps record tuple -> position in table.  It must be keyed by the
    # record: the lookups below are cache.get(item), so a seed keyed by the
    # index would never match, and a character whose record is all zeros
    # would get a second copy of the dummy record.
    cache = {dummy: 0}
    index = [0] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if category == "Zl" or bidirectional == "B":
                flags |= LINEBREAK_MASK
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            # use delta predictor for upper/lower/title if it fits
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                # UCD.html says that a missing title char means that
                # it defaults to the uppercase character, not to the
                # character itself. Apparently, in the current UCD (5.x)
                # this feature is never used
                title = upper
            upper_d = upper - char
            lower_d = lower - char
            title_d = title - char
            if -32768 <= upper_d <= 32767 and \
               -32768 <= lower_d <= 32767 and \
               -32768 <= title_d <= 32767:
                # use deltas: store the signed 16-bit offsets as
                # unsigned 16-bit values
                upper = upper_d & 0xffff
                lower = lower_d & 0xffff
                title = title_d & 0xffff
            else:
                # an offset does not fit; keep the absolute code points
                # and flag the record accordingly
                flags |= NODELTA_MASK
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")

    print("--- Writing", FILE, "...")

    # The context manager closes the header even if a later step raises.
    with open(FILE, "w") as fp:
        print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
        print(file=fp)
        print("/* a list of unique character type descriptors */", file=fp)
        print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
        for item in table:
            print(" {%d, %d, %d, %d, %d, %d}," % item, file=fp)
        print("};", file=fp)
        print(file=fp)

        # split type index table
        index1, index2, shift = splitbins(index, trace)

        print("/* type indexes */", file=fp)
        print("#define SHIFT", shift, file=fp)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

459

460

# --------------------------------------------------------------------

461

# unicode name database

462

463

def makeunicodename(unicode, trace):
    """Build the character-name tables and write Modules/unicodename_db.h.

    unicode -- object providing ``chars`` (the code points to process; each
        one is also used as an index into lists of length ``len(chars)``)
        and ``table`` (per-code-point records whose item 1 is the name).
    trace -- forwarded unchanged to Array.dump, Hash.dump and splitbins.

    Records that are false, and names that are empty or start with "<",
    are ignored.  Progress statistics are printed to standard output.
    """

    FILE = "Modules/unicodename_db.h"

    print("--- Preparing", FILE, "...")

    # collect names.  `names` holds the NUL-terminated form used for the
    # lexicon/phrasebook; `data` holds the (name, code point) pairs that
    # feed the name->code hash table further down.
    names = [None] * len(unicode.chars)
    data = []

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)
                data.append((name, char))

    print(sum(1 for n in names if n is not None), "distinct names")

    # collect unique words from names (note that we differ between
    # words inside a sentence, and words ending a sentence.  the
    # latter includes the trailing null byte.)

    # word -> list whose length is the number of occurrences of the word
    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            parts = name.split()
            b = b + len(name)
            n = n + len(parts)
            for word in parts:
                uses = words.get(word)
                if uses:
                    uses.append(None)
                else:
                    words[word] = [len(words)]

    print(n, "words in text;", b, "bytes")

    wordlist = list(words.items())

    # sort on falling frequency, then by name
    def word_key(a):
        aword, alist = a
        return -len(alist), aword
    wordlist.sort(key=word_key)

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print(escapes, "escapes")

    short = 256 - escapes

    assert short > 0

    print(short, "short indexes in lexicon")

    # statistics.  Slice instead of indexing range(short) so that a data
    # set with fewer than `short` distinct words does not raise IndexError.
    n = sum(len(uses) for _, uses in wordlist[:short])
    print(n, "short indexes in phrasebook")

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)
    # NOTE(review): the key below orders the tail by the word text in
    # descending order, not by length as the comment above says.  Left
    # unchanged because altering it changes the generated file -- confirm
    # the intended ordering.

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}    # from here on: word -> 1-based index into lexicon_offset

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = list(map(ord, lexicon))

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            phrasebook_offset[char] = len(phrasebook)
            for word in name.split():
                i = words[word]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set.  if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print("--- Writing", FILE, "...")

    # the context manager closes the output file even if a dump fails
    with open(FILE, "w") as fp:
        print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
        print(file=fp)
        print("#define NAME_MAXLEN", 256, file=fp)
        print(file=fp)
        print("/* lexicon */", file=fp)
        Array("lexicon", lexicon).dump(fp, trace)
        Array("lexicon_offset", lexicon_offset).dump(fp, trace)

        # split the phrasebook offset table into a two-level index
        offset1, offset2, shift = splitbins(phrasebook_offset, trace)

        print("/* code->name phrasebook */", file=fp)
        print("#define phrasebook_shift", shift, file=fp)
        print("#define phrasebook_short", short, file=fp)

        Array("phrasebook", phrasebook).dump(fp, trace)
        Array("phrasebook_offset1", offset1).dump(fp, trace)
        Array("phrasebook_offset2", offset2).dump(fp, trace)

        print("/* name->code dictionary */", file=fp)
        codehash.dump(fp, trace)

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

622

623

def merge_old_version(version, new, old):
    """Append to new.changed a description of how `old` differs from `new`.

    The appended entry is ``(version, per_char, normalization_changes)``:
    per_char is a list with one ``(bidir, category, decimal, mirrored,
    numeric)`` tuple for each of the 0x110000 code points, and
    normalization_changes is a list of ``(code point, old decomposition)``
    pairs.

    Raises NotImplementedError when the exclusion tables differ, and a
    local Exception subclass when a record field differs that this function
    does not know how to handle.
    """
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    class Difference(Exception):
        pass

    npoints = 0x110000

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF] * npoints
    category_changes = [0xFF] * npoints
    decimal_changes = [0xFF] * npoints
    mirrored_changes = [0xFF] * npoints
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * npoints
    # normalization_changes is a list of key-value pairs
    normalization_changes = []

    for i in range(npoints):
        new_record = new.table[i]
        if new_record is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue

        old_record = old.table[i]
        if old_record is None:
            # unassigned in the old version; category 0 is "unassigned"
            category_changes[i] = 0
            continue

        if old_record == new_record:
            continue

        # walk the fields and record the old value of each one that differs
        for k in range(len(old_record)):
            value = old_record[k]
            if value == new_record[k]:
                continue

            if k == 2:
                category_changes[i] = CATEGORY_NAMES.index(value)
            elif k == 4:
                bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
            elif k == 5:
                # We assume that all normalization changes are in 1:1 mappings
                assert " " not in value
                normalization_changes.append((i, value))
            elif k == 6:
                # we only support changes where the old value is a single digit
                assert value in "0123456789"
                decimal_changes[i] = int(value)
            elif k == 8:
                # Since 0 encodes "no change", the old value is better not 0
                assert value != "0" and value != "-1"
                if value:
                    assert re.match("^[0-9]+$", value)
                    numeric_changes[i] = int(value)
                else:
                    numeric_changes[i] = -1
            elif k == 9:
                mirrored_changes[i] = '1' if value == 'Y' else '0'
            elif k in (11, 12, 13, 14):
                # ISO comment and the simple uppercase / lowercase /
                # titlecase mappings: changes are ignored
                pass
            elif k == 16:
                # derived property changes; not yet
                pass
            else:
                raise Difference(hex(i), k, old_record, new_record)

    per_char = list(zip(bidir_changes, category_changes,
                        decimal_changes, mirrored_changes,
                        numeric_changes))
    new.changed.append((version, per_char, normalization_changes))

Tim Peters

88ca467

2006-03-10 23:39:56 +0000

[diff] [blame]

706

Martin v. Löwis

2006-03-09 23:38:20 +0000

[diff] [blame]

707

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

708

# --------------------------------------------------------------------

709

# the following support code is taken from the unidb utilities

710

711

712

# load a unicode-data file from disk

713

Walter Dörwald

2002-09-11 20:36:02 +0000

[diff] [blame]

714

import sys

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

715

716

class UnicodeData:
    """In-memory table of per-code-point records built from UCD text files.

    Public attributes set by the constructor:
      filename   -- the main data file name, as passed in
      table      -- list of 0x110000 entries; each is None or a record list
      chars      -- list of the code points to process
      exclusions -- dict mapping each code point listed in the exclusions
                    file to 1
      changed    -- initially empty list
    """
    # Record structure:
    # [ID, name, category, combining, bidi, decomp,  (6)
    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
    #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
    #  derived-props] (17)

    def __init__(self, filename, exclusions, eastasianwidth,
                 derivedprops, derivednormalizationprops=None, expand=1):
        """Load the data files and assemble the record table.

        filename -- semicolon-separated main data file; field 0 is the
            hexadecimal code point.
        exclusions -- file whose non-comment lines start with a hexadecimal
            code point.
        eastasianwidth -- file of ``codepoint-or-range;width`` lines.
        derivedprops -- file of ``codepoint-or-range ; property`` lines.
        derivednormalizationprops -- optional file providing the NFD_QC /
            NFKD_QC / NFC_QC / NFKC_QC values; when given, one packed
            quick-check integer is appended to every record.
        expand -- when true, fill in the code points lying between
            ``<..., First>`` and ``<..., Last>`` records.

        Every input file is closed before the constructor returns, also
        when parsing raises.
        """
        self.changed = []
        table = [None] * 0x110000
        with open(filename) as file:
            for s in file:
                s = s.strip().split(";")
                char = int(s[0], 16)
                table[char] = s

        # expand first-last ranges
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
                    # inside a First/Last range: clone the First record
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = list(range(0x110000)) # unicode 3.2

        self.exclusions = {}
        with open(exclusions) as file:
            for s in file:
                s = s.strip()
                if not s:
                    continue
                if s[0] == '#':
                    continue
                char = int(s.split()[0],16)
                self.exclusions[char] = 1

        widths = [None] * 0x110000
        with open(eastasianwidth) as file:
            for s in file:
                s = s.strip()
                if not s:
                    continue
                if s[0] == '#':
                    continue
                s = s.split()[0].split(';')
                for char in self._codepoints(s[0]):
                    widths[char] = s[1]

        # record item 15: east asian width
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

        # record item 16: set of derived property names
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(set())

        with open(derivedprops) as file:
            for s in file:
                s = s.split('#', 1)[0].strip()
                if not s:
                    continue

                r, p = s.split(";")
                r = r.strip()
                p = p.strip()
                for char in self._codepoints(r):
                    if table[char]:
                        # Some properties (e.g. Default_Ignorable_Code_Point)
                        # apply to unassigned code points; ignore them
                        table[char][-1].add(p)

        if derivednormalizationprops:
            quickchecks = [0] * 0x110000 # default is Yes
            qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
            with open(derivednormalizationprops) as file:
                for s in file:
                    if '#' in s:
                        s = s[:s.index('#')]
                    s = [i.strip() for i in s.split(';')]
                    if len(s) < 2 or s[1] not in qc_order:
                        continue
                    quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
                    # two bits per normalization form, in qc_order order
                    quickcheck_shift = qc_order.index(s[1])*2
                    quickcheck <<= quickcheck_shift
                    for char in self._codepoints(s[0]):
                        assert not (quickchecks[char]>>quickcheck_shift)&3
                        quickchecks[char] |= quickcheck
            for i in range(0, 0x110000):
                if table[i] is not None:
                    table[i].append(quickchecks[i])

    @staticmethod
    def _codepoints(spec):
        # "XXXX" -> that single code point; "XXXX..YYYY" -> every code
        # point of the inclusive range (both ends hexadecimal)
        if '..' in spec:
            first, last = [int(c, 16) for c in spec.split('..')]
            return range(first, last+1)
        return [int(spec, 16)]

    def uselatin1(self):
        """Restrict the character range to ISO Latin 1 (code points 0-255)."""
        self.chars = list(range(256))

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

836

Fredrik Lundh

2001-01-21 17:01:31 +0000

[diff] [blame]

837

# hash table tools

838

839

# this is a straight-forward reimplementation of Python's built-in

840

# dictionary type, using a static data structure, and a custom string

841

# hash algorithm.

842

843

def myhash(s, magic):
    """Return a case-insensitive hash of the string s, kept within 24 bits.

    Each character code of s.upper() is mixed in as ``h * magic + code``.
    Whenever bits 24-31 of the running value are non-zero, that byte is
    XOR-ed into the low bits and the value is masked back to 24 bits.
    """
    acc = 0
    for ch in s.upper():
        acc = acc * magic + ord(ch)
        overflow = acc & 0xff000000
        if overflow:
            acc = (acc ^ ((overflow >> 24) & 0xff)) & 0x00ffffff
    return acc

SIZES = [

(4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),

854

(1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),

855

(65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),

856

(2097152,5), (4194304,3), (8388608,33), (16777216,27)

]

class Hash:
    """Static open-addressing hash table built from a (key, value) list.

    Attributes: data (an Array named ``<name>_hash`` holding the slots, with
    0 in every unused slot), magic, name, size, poly and collisions.
    """

    def __init__(self, name, data, magic):
        """Place every value of `data` into a table slot derived from
        myhash(key, magic).

        Raises AssertionError when no entry of SIZES is larger than
        len(data).
        """
        # determine table size: first candidate larger than the item count
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynominals")

        print(size, "slots in hash table")

        slots = [None] * size
        mask = size - 1
        collisions = 0

        # initialize hash table
        for key, value in data:
            h = myhash(key, magic)
            slot = (~h) & mask
            if slots[slot] is not None:
                # first choice is taken: probe with a changing increment
                # until a free slot turns up
                step = (h ^ (h >> 3)) & mask
                if not step:
                    step = mask
                while True:
                    collisions += 1
                    slot = (slot + step) & mask
                    if slots[slot] is None:
                        break
                    step <<= 1
                    if step > mask:
                        step ^= poly
            slots[slot] = value

        print(collisions, "collisions")
        self.collisions = collisions

        # unused slots are emitted as 0
        slots = [0 if entry is None else entry for entry in slots]

        self.data = Array(name + "_hash", slots)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        """Write the slot array, then the magic/size/poly #defines, to file."""
        self.data.dump(file, trace)
        for label, number in (("magic", self.magic),
                              ("size", self.size),
                              ("poly", self.poly)):
            file.write("#define %s_%s %d\n" % (self.name, label, number))

922

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

923

# stuff to deal with arrays of unsigned integers

class Array:
    """A named sequence of unsigned integers that can be written as a C array."""

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        """Write the data to file as a static C array definition.

        The element type is chosen from getsize(self.data): unsigned char
        for 1, unsigned short for 2, unsigned int otherwise.  When trace is
        true the total byte size is reported on sys.stderr.
        """
        width = getsize(self.data)
        if trace:
            print(self.name+":", width*len(self.data), "bytes", file=sys.stderr)

        if width == 1:
            ctype = "unsigned char"
        elif width == 2:
            ctype = "unsigned short"
        else:
            ctype = "unsigned int"
        file.write("static ")
        file.write(ctype)
        file.write(" " + self.name + "[] = {\n")

        if self.data:
            # NOTE(review): the indent literal is reproduced as it appears
            # in this copy (a single space) -- confirm against upstream.
            indent = " "
            line = indent
            for item in self.data:
                cell = str(item) + ", "
                if len(line) + len(cell) <= 78:
                    line = line + cell
                else:
                    # current line is full: flush it and start a new one
                    file.write(line + "\n")
                    line = indent + cell
            if line.strip():
                file.write(line + "\n")
        file.write("};\n\n")

def getsize(data):
    """Return the smallest integer size in bytes (1, 2 or 4) for the array.

    The size is chosen from the largest item: 1 when it is below 256,
    2 when it is below 65536, otherwise 4.  An empty array yields 1
    instead of raising ValueError from max().
    """
    # an empty array has no largest item; any element size can hold it
    maxdata = max(data) if data else 0
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4

Tim Peters

2000-09-25 07:13:41 +0000

[diff] [blame]

967

def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  Splitting pays off when many of the ints
    repeat.  The result is a pair of int lists t1 and t2 plus an int shift,
    picked so that the combined size of t1 and t2 (as C arrays) is minimal
    and, for every i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    with mask being the bitmask that keeps the low "shift" bits.

    A non-zero trace (default zero) prints progress information to
    sys.stderr; values above 1 also report every shift that is tried.
    """
    import sys

    def report(t1, t2, shift, nbytes):
        print("%d+%d bins at shift %d; %d bytes" % (
            len(t1), len(t2), shift, nbytes), file=sys.stderr)

    if trace:
        print("Size of original table:", len(t)*getsize(t), \
                            "bytes", file=sys.stderr)

    # the most we can shift the last valid index and still have something left
    last = len(t) - 1
    maxshift = 0
    while last > 1:
        last >>= 1
        maxshift += 1

    smallest = sys.maxsize  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        width = 2**shift
        seen = {}   # bin contents -> offset of that bin in t2
        for start in range(0, len(t), width):
            chunk = t[start:start+width]
            if chunk not in seen:
                seen[chunk] = len(t2)
                t2.extend(chunk)
            t1.append(seen[chunk] >> shift)
        # determine memory size
        total = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            report(t1, t2, shift, total)
        if total < smallest:
            best = t1, t2, shift
            smallest = total

    t1, t2, shift = best
    if trace:
        print("Best:", end=' ', file=sys.stderr)
        report(t1, t2, shift, smallest)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = (1 << shift) - 1     # low-bit mask of shift bits
        for i, expected in enumerate(t):
            assert expected == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best

Fredrik Lundh

2000-09-24 23:18:31 +0000

[diff] [blame]

1028

1029

if __name__ == "__main__":

Fredrik Lundh