#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
# 2008-06-11 gb   add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#

import sys, os, zipfile

SCRIPT = sys.argv[0]
VERSION = "3.2"

# The Unicode Database
UNIDATA_VERSION = "6.0.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"

old_versions = ["3.2.0"]

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NODELTA_MASK = 0x800
NUMERIC_MASK = 0x1000
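
# Illustrative example (not part of the original script): an uppercase ASCII
# letter such as "A" ends up with
# ALPHA_MASK | UPPER_MASK | XID_START_MASK | XID_CONTINUE_MASK | PRINTABLE_MASK
# (0x781) set in its character type record built by makeunicodetype() below.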

# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
    ('3400', '4DB5'),
    ('4E00', '9FCB'),
    ('20000', '2A6D6'),
    ('2A700', '2B734'),
    ('2B740', '2B81D')
]

def maketables(trace=0):

    print("--- Reading", UNICODE_DATA % "", "...")

    version = ""
    unicode = UnicodeData(UNIDATA_VERSION)

    print(len(list(filter(None, unicode.table))), "characters")

    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-"+version), "...")
        old_unicode = UnicodeData(version, cjk_check=False)
        print(len(list(filter(None, old_unicode.table))), "characters")
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)

# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):

    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            normalizationquickcheck = record[17]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                if len(decomp) > 19:
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char
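
    # Illustrative note (not part of the original script): comp_data is a
    # flattened two-dimensional table, so the NFC pair whose first character
    # has reindexed value f and whose last character has reindexed value l is
    # stored at comp_data[f*total_last + l]; e.g. if total_last were 55, the
    # pair (f=3, l=7) would land at index 3*55 + 7 == 172.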

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
    print("/* a list of unique database records */", file=fp)
    print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    print("/* Reindexing of NFC first characters. */", file=fp)
    print("#define TOTAL_FIRST",total_first, file=fp)
    print("#define TOTAL_LAST",total_last, file=fp)
    print("struct reindex{int start;short count,index;};", file=fp)
    print("static struct reindex nfc_first[] = {", file=fp)
    for start,end in comp_first_ranges:
        print("  { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)
    print("  {0,0,0}", file=fp)
    print("};\n", file=fp)
    print("static struct reindex nfc_last[] = {", file=fp)
    for start,end in comp_last_ranges:
        print("  { %d, %d, %d}," % (start,end-start,comp_last[start]), file=fp)
    print("  {0,0,0}", file=fp)
    print("};\n", file=fp)

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print("/* string literals */", file=fp)
    print("const char *_PyUnicode_CategoryNames[] = {", file=fp)
    for name in CATEGORY_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_BidirectionalNames[] = {", file=fp)
    for name in BIDIRECTIONAL_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_EastAsianWidthNames[] = {", file=fp)
    for name in EASTASIANWIDTH_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("static const char *decomp_prefix[] = {", file=fp)
    for name in decomp_prefix:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print("/* index tables for the database records */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print("/* decomposition data */", file=fp)
    Array("decomp_data", decomp_data).dump(fp, trace)

    print("/* index tables for the decomposition data */", file=fp)
    print("#define DECOMP_SHIFT", shift, file=fp)
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print("/* NFC pairs */", file=fp)
    print("#define COMP_SHIFT", shift, file=fp)
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print("static const change_record change_records_%s[] = {" % cversion, file=fp)
        for record in records:
            print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
        print("};", file=fp)
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tint index;", file=fp)
        print("\tif (n >= 0x110000) index = 0;", file=fp)
        print("\telse {", file=fp)
        print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
        print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1)), file=fp)
        print("\t}", file=fp)
        print("\treturn change_records_%s+index;" % cversion, file=fp)
        print("}\n", file=fp)
        print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tswitch(n) {", file=fp)
        for k, v in normalization:
            print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
        print("\tdefault: return 0;", file=fp)
        print("\t}\n}\n", file=fp)

    fp.close()

# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}
    spaces = []
    linebreaks = []

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            delta = True
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            # use delta predictor for upper/lower/title if it fits
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                # UCD.html says that a missing title char means that
                # it defaults to the uppercase character, not to the
                # character itself. Apparently, in the current UCD (5.x)
                # this feature is never used
                title = upper
            upper_d = upper - char
            lower_d = lower - char
            title_d = title - char
            if -32768 <= upper_d <= 32767 and \
               -32768 <= lower_d <= 32767 and \
               -32768 <= title_d <= 32767:
                # use deltas
                upper = upper_d & 0xffff
                lower = lower_d & 0xffff
                title = title_d & 0xffff
            else:
                flags |= NODELTA_MASK
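            # Worked example (illustrative, not part of the original script):
            # for U+0061 the uppercase mapping is U+0041, so upper_d == -32
            # and the stored 16-bit value is -32 & 0xffff == 0xFFE0; only
            # mappings farther away than +/-32767 fall back to absolute code
            # points via NODELTA_MASK.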
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("/* a list of unique character type descriptors */", file=fp)
    print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    # split decomposition index table
    index1, index2, shift = splitbins(index, trace)

    print("/* type indexes */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # Generate code for _PyUnicode_ToNumeric()
    numeric_items = sorted(numeric.items())
    print('/* Returns the numeric value as double for Unicode characters', file=fp)
    print(' * having this property, -1.0 otherwise.', file=fp)
    print(' */', file=fp)
    print('double _PyUnicode_ToNumeric(Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for value, codepoints in numeric_items:
        # Turn text into float literals
        parts = value.split('/')
        parts = [repr(float(part)) for part in parts]
        value = '/'.join(parts)

        codepoints.sort()
        for codepoint in codepoints:
            print('    case 0x%04X:' % (codepoint,), file=fp)
        print('        return (double) %s;' % (value,), file=fp)
    print('    }', file=fp)
    print('    return -1.0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsWhitespace()
    print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)
    print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsWhitespace(register const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)

    for codepoint in sorted(spaces):
        print('    case 0x%04X:' % (codepoint,), file=fp)
    print('        return 1;', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsLinebreak()
    print("/* Returns 1 for Unicode characters having the line break", file=fp)
    print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp)
    print(" * type 'B', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsLinebreak(register const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for codepoint in sorted(linebreaks):
        print('    case 0x%04X:' % (codepoint,), file=fp)
    print('        return 1;', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    fp.close()

# --------------------------------------------------------------------
# unicode name database

def makeunicodename(unicode, trace):

    FILE = "Modules/unicodename_db.h"

    print("--- Preparing", FILE, "...")

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print(len(list(n for n in names if n is not None)), "distinct names")

    # collect unique words from names (note that we differ between
    # words inside a sentence, and words ending a sentence.  the
    # latter includes the trailing null byte.

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print(n, "words in text;", b, "bytes")

    wordlist = list(words.items())

    # sort on falling frequency, then by name
    def word_key(a):
        aword, alist = a
        return -len(alist), aword
    wordlist.sort(key=word_key)

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print(escapes, "escapes")

    short = 256 - escapes

    assert short > 0

    print(short, "short indexes in lexicon")

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print(n, "short indexes in phrasebook")

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)
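
    # Illustrative note (not part of the original script): because the last
    # character of each encoded word has bit 7 set, lexicon.find(ww) can only
    # match at a point where a stored word ends, so e.g. a word "ROW" may
    # reuse the tail of an already stored "ARROW" instead of being appended
    # again.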

    lexicon = list(map(ord, lexicon))

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    assert getsize(phrasebook) == 1
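
    # Decoding sketch (illustrative, not part of the original script): a
    # phrasebook entry below `short` is a one-byte lexicon word index; larger
    # indexes i are stored as the two bytes (i>>8)+short and i&255, so if
    # short were 250, i == 600 would be stored as 252 followed by 88.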

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set.  if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("#define NAME_MAXLEN", 256, file=fp)
    print(file=fp)
    print("/* lexicon */", file=fp)
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split decomposition index table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print("/* code->name phrasebook */", file=fp)
    print("#define phrasebook_shift", shift, file=fp)
    print("#define phrasebook_short", short, file=fp)

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print("/* name->code dictionary */", file=fp)
    codehash.dump(fp, trace)

    fp.close()


def merge_old_version(version, new, old):
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value is better not 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 16:
                        # derived property changes; not yet
                        pass
                    elif k == 17:
                        # normalization quickchecks are not performed
                        # for older versions
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, list(zip(bidir_changes, category_changes,
                                          decimal_changes, mirrored_changes,
                                          numeric_changes)),
                        normalization_changes))

def open_data(template, version):
    local = template % ('-'+version,)
    if not os.path.exists(local):
        import urllib.request
        if version == '3.2.0':
            # irregular url structure
            url = 'http://www.unicode.org/Public/3.2-Update/' + local
        else:
            url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
        urllib.request.urlretrieve(url, filename=local)
    if local.endswith('.txt'):
        return open(local, encoding='utf-8')
    else:
        # Unihan.zip
        return open(local, 'rb')

# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk

class UnicodeData:
    # Record structure:
    # [ID, name, category, combining, bidi, decomp,  (6)
    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
    #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
    #  derived-props] (17)
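    #
    # For instance (illustrative, not part of the original script): for
    # U+0041 the parsed record has record[1] == "LATIN CAPITAL LETTER A",
    # record[2] == "Lu" and record[13] == "0061" (the simple lowercase
    # mapping); the east asian width, derived-properties set and quickcheck
    # fields are appended by __init__ below.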

    def __init__(self, version,
                 linebreakprops=False,
                 expand=1,
                 cjk_check=True):
        self.changed = []
        file = open_data(UNICODE_DATA, version)
        table = [None] * 0x110000
        while 1:
            s = file.readline()
            if not s:
                break
            s = s.strip().split(";")
            char = int(s[0], 16)
            table[char] = s

        cjk_ranges_found = []

        # expand first-last ranges
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        if s[1].startswith("<CJK Ideograph"):
                            cjk_ranges_found.append((field[0],
                                                     s[0]))
                        s[1] = ""
                        field = None
                elif field:
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2
        if cjk_check and cjk_ranges != cjk_ranges_found:
            raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)

        # public attributes
        self.filename = UNICODE_DATA % ''
        self.table = table
        self.chars = list(range(0x110000)) # unicode 3.2

        file = open_data(COMPOSITION_EXCLUSIONS, version)
        self.exclusions = {}
        for s in file:
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            char = int(s.split()[0],16)
            self.exclusions[char] = 1

        widths = [None] * 0x110000
        for s in open_data(EASTASIAN_WIDTH, version):
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            s = s.split()[0].split(';')
            if '..' in s[0]:
                first, last = [int(c, 16) for c in s[0].split('..')]
                chars = list(range(first, last+1))
            else:
                chars = [int(s[0], 16)]
            for char in chars:
                widths[char] = s[1]
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(set())
        for s in open_data(DERIVED_CORE_PROPERTIES, version):
            s = s.split('#', 1)[0].strip()
            if not s:
                continue

            r, p = s.split(";")
            r = r.strip()
            p = p.strip()
            if ".." in r:
                first, last = [int(c, 16) for c in r.split('..')]
                chars = list(range(first, last+1))
            else:
                chars = [int(r, 16)]
            for char in chars:
                if table[char]:
                    # Some properties (e.g. Default_Ignorable_Code_Point)
                    # apply to unassigned code points; ignore them
                    table[char][-1].add(p)

        for s in open_data(LINE_BREAK, version):
            s = s.partition('#')[0]
            s = [i.strip() for i in s.split(';')]
            if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
                continue
            if '..' not in s[0]:
                first = last = int(s[0], 16)
            else:
                first, last = [int(c, 16) for c in s[0].split('..')]
            for char in range(first, last+1):
                table[char][-1].add('Line_Break')

        # We only want the quickcheck properties
        # Format: NF?_QC; Y(es)/N(o)/M(aybe)
        # Yes is the default, hence only N and M occur
        # In 3.2.0, the format was different (NF?_NO)
        # The parsing will incorrectly determine these as
        # "yes", however, unicodedata.c will not perform quickchecks
        # for older versions, and no delta records will be created.
        quickchecks = [0] * 0x110000
        qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
        for s in open_data(DERIVEDNORMALIZATION_PROPS, version):
            if '#' in s:
                s = s[:s.index('#')]
            s = [i.strip() for i in s.split(';')]
            if len(s) < 2 or s[1] not in qc_order:
                continue
            quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
            quickcheck_shift = qc_order.index(s[1])*2
            quickcheck <<= quickcheck_shift
            if '..' not in s[0]:
                first = last = int(s[0], 16)
            else:
                first, last = [int(c, 16) for c in s[0].split('..')]
            for char in range(first, last+1):
                assert not (quickchecks[char]>>quickcheck_shift)&3
                quickchecks[char] |= quickcheck
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(quickchecks[i])

        zip = zipfile.ZipFile(open_data(UNIHAN, version))
        if version == '3.2.0':
            data = zip.open('Unihan-3.2.0.txt').read()
        else:
            data = zip.open('Unihan_NumericValues.txt').read()
        for line in data.decode("utf-8").splitlines():
            if not line.startswith('U+'):
                continue
            code, tag, value = line.split(None, 3)[:3]
            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                           'kOtherNumeric'):
                continue
            value = value.strip().replace(',', '')
            i = int(code[2:], 16)
            # Patch the numeric field
            if table[i] is not None:
                table[i][8] = value

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = list(range(256))

# hash table tools

# this is a straight-forward reimplementation of Python's built-in
# dictionary type, using a static data structure, and a custom string
# hash algorithm.
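#
# As a quick illustration (not part of the original script): myhash("A", 47)
# folds the single byte 65 into h = 0*47 + 65 == 65; no bits above 0x00ffffff
# are set, so the hash of "A" is simply 65.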

def myhash(s, magic):
    h = 0
    for c in map(ord, s.upper()):
        h = (h * magic) + c
        ix = h & 0xff000000
        if ix:
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
    return h

SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]

class Hash:
    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynomials")

        print(size, "slots in hash table")

        table = [None] * size

        mask = size-1

        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            incr = (h ^ (h >> 3)) & mask;
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print(n, "collisions")
        self.collisions = n

        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))

# stuff to deal with arrays of unsigned integers

class Array:

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array
        size = getsize(self.data)
        if trace:
            print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)
        file.write("static ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = "    "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    file.write(s + "\n")
                    s = "    " + i
                else:
                    s = s + i
            if s.strip():
                file.write(s + "\n")
        file.write("};\n\n")

def getsize(data):
    # return smallest possible integer size for the given array
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4

def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """
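
    # Worked example (illustrative, not part of the original script):
    # splitbins([1, 2, 1, 2, 1, 2, 1, 2]) returns t1 == [0, 0, 0, 0],
    # t2 == [1, 2] and shift == 1; t[5] is then recovered as
    # t2[(t1[5 >> 1] << 1) + (5 & 1)] == t2[1] == 2.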

    if trace:
        def dump(t1, t2, shift, bytes):
            print("%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes), file=sys.stderr)
        print("Size of original table:", len(t)*getsize(t), \
              "bytes", file=sys.stderr)
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxsize  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print("Best:", end=' ', file=sys.stderr)
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best

if __name__ == "__main__":
    maketables(1)