#!/usr/bin/python -u
#
# Original script modified in November 2003 to take advantage of
# the character-validation range routines, and updated to the
# current Unicode information (Version 4.0.1)
#
# NOTE: there is an 'alias' facility for blocks which are not present in
#       the current release, but are needed for ABI compatibility.  This
#       must be accomplished MANUALLY!  Please see the comments below under
#       'blockAliases'
#
import string
import sys
import time

# Location of the Unicode Character Database description, and the names of
# the two data files read by this script (block list first, per-character
# data second), separated by a space.
webpage = "http://www.unicode.org/Public/4.0-Update1/UCD-4.0.1d5b.html"
sources = "Blocks-4.0.1d1b.txt UnicodeData-4.0.1d1b.txt"

#
# blockAliases is a small hack - it is used for mapping block names which
# were used in the 3.1 release, but are missing or changed in the current
# release.  The format is "OldBlockName:NewBlockName1[,NewBlockName2[,...]]"
#
blockAliases = [
    "CombiningMarksforSymbols:CombiningDiacriticalMarksforSymbols",
    "Greek:GreekandCoptic",
    ("PrivateUse:PrivateUseArea,SupplementaryPrivateUseArea-A," +
     "SupplementaryPrivateUseArea-B"),
]

# minTableSize is the threshold for producing a range table: a table is
# generated only for a category with more than this number of ranges,
# otherwise inline comparisons are generated
minTableSize = 8

# str.split() is used rather than string.split(): the function form is
# deprecated and does not exist in Python 3.
(blockfile, catfile) = sources.split()

#
# Now process the "blocks" file, reducing it to a dictionary
# indexed by blockname, containing a list of (start, end) tuples
# with the applicable block ranges, each bound being a "0x"-prefixed
# hexadecimal string
#
BlockNames = {}
try:
    blocks = open(blockfile, "r")
except (IOError, OSError):
    print("Missing %s, aborting ..." % blockfile)
    sys.exit(1)

# the 'with' guarantees the file is closed even if parsing blows up
with blocks:
    for line in blocks:
        line = line.strip()
        # skip blank lines and comments
        if line == '' or line.startswith('#'):
            continue
        try:
            # expected layout is "START..END; Block Name"
            fields = line.split(';')
            (start, end) = fields[0].strip().split("..")
            name = fields[1].strip().replace(' ', '')
        except (IndexError, ValueError):
            # missing ';' field or a range which is not "START..END"
            print("Failed to process line: %s" % (line))
            continue
        BlockNames.setdefault(name, []).append(("0x" + start, "0x" + end))
print("Parsed %d blocks descriptions" % (len(BlockNames)))

#
# Add an entry for each alias (old block name), collecting the ranges
# of every current block it maps to.  A component which is not a known
# block is reported and skipped.
#
for block in blockAliases:
    alias = block.split(':')
    oldname = alias[0]
    for comp in alias[1].split(','):
        if comp in BlockNames:
            BlockNames.setdefault(oldname, []).extend(BlockNames[comp])
        else:
            print("Alias %s: %s not in Blocks" % (oldname, comp))

#
# Next process the Categories file.  This is more complex, since
# the file is in code sequence, and we need to invert it.  We use
# a dictionary with index category-name, with each entry containing
# all the codepoints of that category.  Note that category
# names comprise two parts - the general category, and the "subclass"
# within that category.  Therefore, both "general category" (which is
# the first character of the 2-character category-name) and the full
# (2-character) name are entered into this dictionary.
#
try:
    data = open(catfile, "r")
except (IOError, OSError):
    print("Missing %s, aborting ..." % catfile)
    sys.exit(1)

nbchar = 0
Categories = {}
# the 'with' closes the categories file itself (the previous code closed
# the already-closed blocks file instead and leaked this one)
with data:
    for line in data:
        line = line.strip()
        # skip blank lines and comments
        if line == '' or line.startswith('#'):
            continue
        try:
            fields = line.split(';')
            # first field is the code point in hexadecimal; a malformed
            # value raises ValueError instead of being silently mangled
            value = int(fields[0].strip(), 16)
            # third field is the category name
            name = fields[2]
            if name == '':
                raise ValueError("empty category name")
        except (IndexError, ValueError):
            print("Failed to process line: %s" % (line))
            continue

        nbchar = nbchar + 1
        # update entry for "full name"
        Categories.setdefault(name, []).append(value)
        # update "general category" name
        Categories.setdefault(name[0], []).append(value)

print("Parsed %d char generating %d categories" % (nbchar, len(Categories)))

#
# The data is now all read.  Time to process it into a more useful form.
#
# reduce the number list into ranges
def _collapseRuns(values):
    """Return the list of (first, last) runs of consecutive values.

    values is scanned in the order given; a new run is started each time
    a value is not exactly one more than the previous one.  An isolated
    value yields a (value, value) tuple.
    """
    runs = []
    first = last = None
    for val in values:
        if first is None:
            first = last = val
        elif val == last + 1:
            last = val
        else:
            runs.append((first, last))
            first = last = val
    if first is not None:
        runs.append((first, last))
    return runs

# only the values of existing keys are replaced, so iterating the
# dictionary while assigning is safe
for cat in Categories:
    Categories[cat] = _collapseRuns(Categories[cat])

#
# Assure all data is in alphabetic order, since we will be doing binary
# searches on the tables.
#
# sorted() is used because dict.keys() has no sort() method in Python 3
bkeys = sorted(BlockNames.keys())

ckeys = sorted(Categories.keys())

#
# Generate the resulting files
#
try:
    header = open("include/libxml/xmlunicode.h", "w")
except (IOError, OSError):
    print("Failed to open include/libxml/xmlunicode.h")
    sys.exit(1)

try:
    output = open("xmlunicode.c", "w")
except (IOError, OSError):
    # do not leave the header handle dangling when bailing out
    header.close()
    print("Failed to open xmlunicode.c")
    sys.exit(1)

# generation timestamp (local time) embedded in both generated files
date = time.asctime(time.localtime(time.time()))

# Start of the generated header: banner comment (filled in with the UCD
# web page, the generation date and the source file names), include guard,
# version include and the opening of the C++ linkage block.
headerProlog = """/*
 * Summary: Unicode character APIs
 * Description: API for the Unicode character APIs
 *
 * This file is automatically generated from the
 * UCS description files of the Unicode Character Database
 * %s
 * using the genUnicode.py Python script.
 *
 * Generation date: %s
 * Sources: %s
 * Author: Daniel Veillard
 */

#ifndef __XML_UNICODE_H__
#define __XML_UNICODE_H__

#include <libxml/xmlversion.h>

#ifdef __cplusplus
extern "C" {
#endif

"""
header.write(headerProlog % (webpage, date, sources))

# Start of the generated C module: banner comment (filled in with the UCD
# web page, the generation date and the source file names), includes, the
# name-table types, the forward declaration of the lookup routine and the
# opening of the block name table.
outputProlog = """/*
 * xmlunicode.c: this module implements the Unicode character APIs
 *
 * This file is automatically generated from the
 * UCS description files of the Unicode Character Database
 * %s
 * using the genUnicode.py Python script.
 *
 * Generation date: %s
 * Sources: %s
 * Daniel Veillard <veillard@redhat.com>
 */

#define IN_LIBXML
#include "libxml.h"

#ifdef LIBXML_UNICODE_ENABLED

#include <string.h>
#include <libxml/xmlversion.h>
#include <libxml/xmlunicode.h>
#include <libxml/chvalid.h>

typedef int (xmlIntFunc)(int); /* just to keep one's mind untwisted */

typedef struct {
 const char *rangename;
 xmlIntFunc *func;
} xmlUnicodeRange;

typedef struct {
 xmlUnicodeRange *table;
 int numentries;
} xmlUnicodeNameTable;


static xmlIntFunc *xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname);

static xmlUnicodeRange xmlUnicodeBlocks[] = {
"""
output.write(outputProlog % (webpage, date, sources))

# One {"name", function} initializer per block, in sorted order; the
# function name is the block name with any '-' removed.
blockEntries = [' {"%s", xmlUCSIs%s}' % (block, block.replace('-', ''))
                for block in bkeys]
output.write(',\n'.join(blockEntries))
output.write('};\n\n')

# Same layout for the category table.
output.write('static xmlUnicodeRange xmlUnicodeCats[] = {\n')
catEntries = [' {"%s", xmlUCSIsCat%s}' % (name, name) for name in ckeys]
output.write(',\n'.join(catEntries))
output.write('};\n\n')

#
# For any categories with more than minTableSize ranges we generate
# a range table suitable for xmlCharInRange
#
def _writeRangeTable(out, name, ranges):
    """Write the short/long range arrays and the range group for 'name'.

    Ranges whose upper bound is below 0x10000 go to the xmlChSRange array,
    the others to the xmlChLRange array.  A pending output line is flushed
    once it is longer than 60 characters.
    """
    shortCount = 0
    longCount = 0
    shortRef = "NULL"
    longRef = "NULL"
    for (low, high) in ranges:
        isShort = high < 0x10000
        if isShort and shortCount == 0:
            # first short range: open the short array
            pline = "static xmlChSRange xml%sS[] = {" % name
            shortRef = "xml%sS" % name
        elif not isShort and longCount == 0:
            # first long range: close the short array (if any was
            # started) and open the long one
            if shortCount > 0:
                out.write(pline + " };\n")
            pline = "static xmlChLRange xml%sL[] = {" % name
            longRef = "xml%sL" % name
        else:
            pline += ", "
        if isShort:
            shortCount += 1
        else:
            longCount += 1
        if len(pline) > 60:
            out.write(pline + "\n")
            pline = " "
        pline += "{%s, %s}" % (hex(low), hex(high))
    group = " };\nstatic xmlChRangeGroup xml%sG = {%s,%s,%s,%s};\n\n" % (
        name, shortCount, longCount, shortRef, longRef)
    out.write(pline + group)

for name in ckeys:
    if len(Categories[name]) > minTableSize:
        _writeRangeTable(output, name, Categories[name])

# Emit the two name-table descriptors (filled in with the number of block
# and category entries) and the binary-search lookup routine.  The
# generated doc comment names the parameter @tname, matching the
# function's actual parameter.
output.write(
"""xmlUnicodeNameTable xmlUnicodeBlockTbl = {xmlUnicodeBlocks, %s};
xmlUnicodeNameTable xmlUnicodeCatTbl = {xmlUnicodeCats, %s};

/**
 * xmlUnicodeLookup:
 * @tptr: pointer to the name table
 * @tname: name to be found
 *
 * binary table lookup for user-supplied name
 *
 * Returns pointer to range function if found, otherwise NULL
 */
static xmlIntFunc
*xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname) {
 int low, high, mid, cmp;
 xmlUnicodeRange *sptr;

 low = 0;
 high = tptr->numentries - 1;
 sptr = tptr->table;
 while (low <= high) {
 mid = (low + high) / 2;
 if ((cmp=strcmp(tname, sptr[mid].rangename)) == 0)
 return (sptr[mid].func);
 if (cmp < 0)
 high = mid - 1;
 else
 low = mid + 1;
 }
 return (NULL);
}

""" % (len(BlockNames), len(Categories)) )

# For each block emit the prototype into the header and, into the C file,
# a documented predicate testing the code point against every range of
# the block.
for block in bkeys:
    name = block.replace('-', '')
    header.write("XMLPUBFUN int XMLCALL xmlUCSIs%s\t(int code);\n" % name)
    tests = ["((code >= %s) && (code <= %s))" % (start, end)
             for (start, end) in BlockNames[block]]
    pieces = [
        "/**\n * xmlUCSIs%s:\n * @code: UCS code point\n" % (name),
        " *\n * Check whether the character is part of %s UCS Block\n" % (block),
        " *\n * Returns 1 if true 0 otherwise\n */\n",
        "int\nxmlUCSIs%s(int code) {\n return(" % name,
        " ||\n ".join(tests),
        ");\n}\n\n",
    ]
    output.write("".join(pieces))

# Prototype and implementation of the by-name block test, which looks the
# block name up in the block table and calls the matching predicate.
header.write("\nXMLPUBFUN int XMLCALL xmlUCSIsBlock\t(int code, const char *block);\n\n")
isBlockSource = """/**
 * xmlUCSIsBlock:
 * @code: UCS code point
 * @block: UCS block name
 *
 * Check whether the character is part of the UCS Block
 *
 * Returns 1 if true, 0 if false and -1 on unknown block
 */
int
xmlUCSIsBlock(int code, const char *block) {
 xmlIntFunc *func;

 func = xmlUnicodeLookup(&xmlUnicodeBlockTbl, block);
 if (func == NULL)
 return (-1);
 return (func(code));
}

"""
output.write(isBlockSource)

# For each category emit the prototype into the header and a documented
# predicate into the C file.  Categories with more than minTableSize
# ranges use their range group through xmlCharInRange, the others get
# inline comparisons.
for name in ckeys:
    ranges = Categories[name]
    header.write("XMLPUBFUN int XMLCALL xmlUCSIsCat%s\t(int code);\n" % name)
    output.write("/**\n * xmlUCSIsCat%s:\n * @code: UCS code point\n" % (name))
    output.write(" *\n * Check whether the character is part of %s UCS Category\n" % (name))
    output.write(" *\n * Returns 1 if true 0 otherwise\n */\n")
    output.write("int\nxmlUCSIsCat%s(int code) {\n" % name)
    if len(ranges) > minTableSize:
        output.write(" return(xmlCharInRange((unsigned int)code, &xml%sG)" % name)
    else:
        tests = []
        for (begin, end) in ranges:
            if begin == end:
                tests.append("(code == %s)" % (hex(begin)))
            else:
                tests.append("((code >= %s) && (code <= %s))" % (hex(begin), hex(end)))
        # nothing is written for the return when there is no range at all
        if tests:
            output.write(" return(" + " ||\n ".join(tests))
    output.write(");\n}\n\n")

# Prototype and implementation of the by-name category test, followed by
# the end of the LIBXML_UNICODE_ENABLED section of the C file.
header.write("\nXMLPUBFUN int XMLCALL xmlUCSIsCat\t(int code, const char *cat);\n")
isCatSource = """/**
 * xmlUCSIsCat:
 * @code: UCS code point
 * @cat: UCS Category name
 *
 * Check whether the character is part of the UCS Category
 *
 * Returns 1 if true, 0 if false and -1 on unknown category
 */
int
xmlUCSIsCat(int code, const char *cat) {
 xmlIntFunc *func;

 func = xmlUnicodeLookup(&xmlUnicodeCatTbl, cat);
 if (func == NULL)
 return (-1);
 return (func(code));
}


#endif /* LIBXML_UNICODE_ENABLED */
"""
output.write(isCatSource)

# Close the C++ linkage block and the include guard of the header, then
# release both generated files.
headerEpilog = ("\n"
                "#ifdef __cplusplus\n"
                "}\n"
                "#endif\n"
                "#endif /* __XML_UNICODE_H__ */\n")
header.write(headerEpilog)

header.close()
output.close()