#!/usr/bin/python -u
#
# Portions of this script have been (shamelessly) stolen from the
# prior work of Daniel Veillard (genUnicode.py)
#
# I, however, take full credit for any bugs, errors or difficulties :-)
#
# William Brack
# October 2003
#
# 18 October 2003
# Modified to maintain binary compatibility with previous library versions
# by adding a suffix 'Q' ('quick') to the macro generated for the original
# function, and adding generation of a function (with the original name) which
# instantiates the macro.
#

18import sys
19import string
20import time
21
#
# A routine to take a list of yes/no (1, 0) values and turn it
# into a list of ranges.  This will later be used to determine whether
# to generate single-byte lookup tables, or inline comparisons
#
def makeRange(lst):
    """Convert a list of 1/0 flags into a list of inclusive index ranges.

    Every maximal run of 1 entries in `lst` yields one `(first, last)`
    tuple, where `first` and `last` are the indices of the first and the
    last 1 of that run.

    Returns the ranges in ascending index order; the result is empty when
    `lst` is empty or contains no 1.
    """
    ret = []
    size = len(lst)
    pos = 0
    while pos < size:
        # list.index raises ValueError when the value is absent; searching
        # from `pos` avoids copying the tail of the list on every pass
        try:
            start = lst.index(1, pos)   # start of next range
        except ValueError:
            break                       # no more ranges
        try:
            end = lst.index(0, start)   # first entry past the range
        except ValueError:
            end = size                  # range runs to the end of the list
        ret.append((start, end - 1))
        pos = end + 1                   # lst[end] is known not to start a range
    return ret

sources = "chvalid.def"                 # input filename

# minTableSize gives the minimum number of ranges which must be present
# before a 256-byte lookup table is produced.  If there are less than this
# number, a macro with inline comparisons is generated
minTableSize = 6

# dictionary of functions, key=name, element contains char-map and range-list
Functs = {}

# parser state: 0 = outside any function block, 1 = inside a 'name' block
state = 0


class ValidationError(Exception):
    """Raised when a line of the definition file cannot be accepted."""


def _parseValue(text):
    """Convert one value token of a 'ur' line to an integer.

    Three spellings are recognised: hexadecimal with a '0x' prefix, a
    character introduced by a single quote (the character following the
    quote is used; a closing quote is not checked), and plain decimal.

    Raises IndexError for an empty token and ValueError for text that is
    not a valid number.
    """
    if text[0:2] == '0x':
        return int(text[2:], 16)
    if text[0] == "'":
        return ord(text[1])
    return int(text)


try:
    defines = open(sources, "r")
except IOError:
    print("Missing chvalid.def, aborting ...")
    sys.exit(1)

# read everything up front so that the input file can be closed at once
deflines = defines.readlines()
defines.close()

#
# The lines in the .def file have three types:-
#   name:   Defines a new function block
#   ur:     Defines individual or ranges of unicode values
#   end:    Indicates the end of the function block
#
# These lines are processed below.
#
for line in deflines:
    # ignore blank lines, or lines beginning with '#'
    if line[0] == '#':
        continue
    line = line.strip()
    if line == '':
        continue
    # split line into space-separated fields, then split on type
    try:
        fields = line.split(' ')
        #
        # name line:
        #   validate any previous function block already ended
        #   validate this function not already defined
        #   initialize an entry in the function dictionary
        #       including a mask table with no values yet defined
        #
        if fields[0] == 'name':
            if state != 0:
                # the current block stays active, so 'name' must not be
                # overwritten by the rejected line
                print("'name' %s found before previous name "
                      "completed" % (fields[1]))
                continue
            name = fields[1]
            state = 1
            if name in Functs:
                print("name '%s' already present - may give"
                      " wrong results" % (name))
            else:
                # dict entry with two list elements (chdata, rangedata)
                Functs[name] = [[0] * 256, []]
        #
        # end line:
        #   validate there was a preceding function name line
        #   set state to show no current function active
        #
        elif fields[0] == 'end':
            if state == 0:
                print("'end' found outside of function block")
                continue
            state = 0

        #
        # ur line:
        #   validate function has been defined
        #   process remaining fields on the line, which may be either
        #       individual unicode values or ranges of values
        #
        elif fields[0] == 'ur':
            if state != 1:
                raise ValidationError("'ur' found outside of 'name' block")
            for el in fields[1:]:
                pos = el.find('..')
                # pos <= 0 means not a range, so must be individual value
                if pos <= 0:
                    value = _parseValue(el)
                    if value < 0 or value > 0x1fffff:
                        raise ValidationError('Illegal value (%s) in ch for'
                                              ' name %s' % (el, name))
                    # for ur we have only ranges (makes things simpler),
                    # so convert val to range
                    currange = (value, value)
                # pos > 0 means this is a range, so isolate/validate
                # the interval
                else:
                    # split the range into its first-val, last-val
                    (first, last) = el.split("..")
                    start = _parseValue(first)
                    end = _parseValue(last)
                    if start < 0 or end > 0x1fffff or start > end:
                        raise ValidationError("Invalid range '%s'" % el)
                    currange = (start, end)
                # common path - 'currange' has the range, now take care of it
                # We split on single-byte values vs. multibyte
                if currange[1] < 0x100:         # single-byte
                    for ch in range(currange[0], currange[1] + 1):
                        # validate that value not previously defined
                        if Functs[name][0][ch]:
                            msg = "Duplicate ch value '%s' for name '%s'" % (el, name)
                            raise ValidationError(msg)
                        Functs[name][0][ch] = 1
                else:                           # multi-byte
                    if currange in Functs[name][1]:
                        raise ValidationError("range already defined in"
                                              " function")
                    Functs[name][1].append(currange)

    except Exception:
        # report the offending line, then let the error end the run
        print("Failed to process line: %s" % (line))
        raise
#
# At this point, the entire definition file has been processed.  Now we
# enter the output phase, where we generate the two files chvalid.c and
# chvalid.h
#
# To do this, we first output the 'static' data (heading, fixed
# definitions, etc.), then output the 'dynamic' data (the results
# of the above processing), and finally output closing 'static' data
# (e.g. the subroutine to process the ranges)
#

#
# Generate the headings:
#
# Only a failure of open() itself is reported as such; any other error is
# left to propagate instead of being mislabelled.
try:
    header = open("include/libxml/chvalid.h", "w")
except IOError:
    print("Failed to open include/libxml/chvalid.h")
    sys.exit(1)

try:
    output = open("chvalid.c", "w")
except IOError:
    print("Failed to open chvalid.c")
    sys.exit(1)

# timestamp substituted into the heading of both generated files
date = time.asctime(time.localtime(time.time()))

# Fixed opening text of the two generated files.  Each template takes two
# substitutions: the generation date and the name of the definition file.
#
# NOTE(review): the copy of this script under review had runs of whitespace
# collapsed, so the indentation inside these templates should be confirmed
# against the repository before the generated files are compared.
headerProlog = """/*
 * Summary: Unicode character range checking
 * Description: this module exports interfaces for the character
 * range validation APIs
 *
 * This file is automatically generated from the cvs source
 * definition files using the genChRanges.py Python script
 *
 * Generation date: %s
 * Sources: %s
 * Author: William Brack <wbrack@mmm.com.hk>
 */

#ifndef __XML_CHVALID_H__
#define __XML_CHVALID_H__

#include <libxml/xmlversion.h>

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Define our typedefs and structures
 *
 */
typedef struct _xmlChSRange xmlChSRange;
typedef xmlChSRange *xmlChSRangePtr;
struct _xmlChSRange {
 unsigned short low;
 unsigned short high;
};

typedef struct _xmlChLRange xmlChLRange;
typedef xmlChLRange *xmlChLRangePtr;
struct _xmlChLRange {
 unsigned int low;
 unsigned int high;
};

typedef struct _xmlChRangeGroup xmlChRangeGroup;
typedef xmlChRangeGroup *xmlChRangeGroupPtr;
struct _xmlChRangeGroup {
 int nbShortRange;
 int nbLongRange;
 xmlChSRangePtr shortRange; /* points to an array of ranges */
 xmlChLRangePtr longRange;
};

/**
 * Range checking routine
 */
XMLPUBFUN int XMLCALL
 xmlCharInRange(unsigned int val, const xmlChRangeGroupPtr group);

"""

sourceProlog = """/*
 * chvalid.c: this module implements the character range
 * validation APIs
 *
 * This file is automatically generated from the cvs source
 * definition files using the genChRanges.py Python script
 *
 * Generation date: %s
 * Sources: %s
 * William Brack <wbrack@mmm.com.hk>
 */

#define IN_LIBXML
#include "libxml.h"
#include <libxml/chvalid.h>

/*
 * The initial tables ({func_name}_tab) are used to validate whether a
 * single-byte character is within the specified group. Each table
 * contains 256 bytes, with each byte representing one of the 256
 * possible characters. If the table byte is set, the character is
 * allowed.
 *
 */
"""

header.write(headerProlog % (date, sources))
output.write(sourceProlog % (date, sources))

#
# Now output the generated data.
# We try to produce the best execution times.  Tests have shown that validation
# with direct table lookup is, when there are a "small" number of valid items,
# still not as fast as a sequence of inline compares.  So, if the single-byte
# portion of a range has a "small" number of ranges, we output a macro for inline
# compares, otherwise we output a 256-byte table and a macro to use it.
#

# separator placed between the alternatives of an inline macro
_OR_CONTINUE = " || \\\n\t\t\t\t "


def _macroDoc(macro):
    """Return the documentation comment emitted ahead of macro `macro`."""
    return ("\n/**\n * %s:\n * @c: char to validate\n *\n"
            " * Automatically generated by genChRanges.py\n */\n" % macro)


def _padToColumn(text):
    """Return `text` followed by the tabs needed to position to col. 33.

    One tab is assumed to advance 8 columns; text that is already past the
    target column gets no padding.
    """
    # floor division keeps the tab count an integer on every Python version
    ntab = 4 - len(text) // 8
    if ntab < 0:
        ntab = 0
    return text + "\t" * ntab


def _rangeTest(rg, dropTopBound):
    """Return the C expression testing `(c)` against the range tuple `rg`.

    A one-value range becomes an equality test.  When `dropTopBound` is
    true, a range ending at 0xff is emitted without its upper comparison
    (the original comment gives the reason as "since we are doing char").
    """
    if rg[0] == rg[1]:                  # single value - check equal
        return "((c) == 0x%x)" % rg[0]
    if dropTopBound and rg[1] == 0xff:
        return " (0x%x <= (c))" % rg[0]
    return "((0x%x <= (c)) && ((c) <= 0x%x))" % (rg[0], rg[1])


fkeys = sorted(Functs.keys())   # all defined functions, in a stable order

for f in fkeys:
    chmap = Functs[f][0]
    hasSingleByte = max(chmap) > 0

    # First we convert the specified single-byte values into a group of ranges.
    # If the total number of such ranges is less than minTableSize, we generate
    # an inline macro for direct comparisons; if greater, we generate a lookup
    # table.
    if hasSingleByte:                   # only check if at least one entry
        rangeTable = makeRange(chmap)
        numRanges = len(rangeTable)
        if numRanges >= minTableSize:   # table is worthwhile
            header.write("XMLPUBVAR unsigned char %s_tab[256];\n" % f)
            header.write(_macroDoc("%s_ch" % f))
            header.write("#define %s_ch(c)\t(%s_tab[(c)])\n" % (f, f))

            # write the constant data to the code file
            # NOTE(review): whitespace runs were collapsed in the reviewed
            # copy; confirm the width of the line indent below
            output.write("unsigned char %s_tab[256] = {\n" % f)
            pline = " "
            for n in range(255):
                pline += " 0x%02x," % chmap[n]
                if len(pline) > 72:
                    output.write(pline + "\n")
                    pline = " "
            output.write(pline + " 0x%02x };\n\n" % chmap[255])

        else:                           # inline check is used
            # first another little optimisation - if space is present,
            # put it at the front of the list so it is checked first
            try:
                rangeTable.remove((0x20, 0x20))
            except ValueError:
                pass                    # space is not a range of its own
            else:
                rangeTable.insert(0, (0x20, 0x20))

            header.write(_macroDoc("%s_ch" % f))
            tests = [_rangeTest(rg, 1) for rg in rangeTable]
            header.write(_padToColumn("#define %s_ch(c)" % f) + "(" +
                         _OR_CONTINUE.join(tests) + ")\n")

    # The 'Q' macro handles the full range: values below 0x100 go to the
    # single-byte test, everything else to the multi-byte ranges.
    header.write(_macroDoc("%sQ" % f))
    header.write(_padToColumn("#define %sQ(c)" % f) +
                 "(((c) < 0x100) ? \\\n\t\t\t\t ")
    if hasSingleByte:
        header.write("%s_ch((c)) :" % f)
    else:
        header.write("0 :")

    # if no ranges defined, value invalid if >= 0x100
    longRanges = Functs[f][1]
    numRanges = len(longRanges)
    if numRanges == 0:
        header.write(" 0)\n\n")
    elif numRanges >= minTableSize:
        header.write(" \\\n\t\t\t\t xmlCharInRange((c), &%sGroup))\n\n" % f)
    else:                               # if < minTableSize, generate inline code
        tests = [_rangeTest(rg, 0) for rg in longRanges]
        header.write("\\\n\t\t\t\t(" + _OR_CONTINUE.join(tests) + "))\n\n")

    if numRanges > 0:
        header.write("XMLPUBVAR xmlChRangeGroup %sGroup;\n" % f)


#
# Next we do the unicode ranges
#

def _rangeArray(ctype, ident, ranges):
    """Return the C definition of one static array of range pairs.

    `ctype` is the element type name, `ident` the array name and `ranges`
    a non-empty list of (low, high) tuples.  A new output line is started
    whenever the current one has grown past 60 characters.
    """
    # NOTE(review): whitespace runs were collapsed in the reviewed copy;
    # confirm the width of the continuation-line indent below
    chunks = []
    pline = "static %s %s[] = { " % (ctype, ident)
    first = 1
    for rg in ranges:
        if not first:
            pline += ", "
        first = 0
        if len(pline) > 60:
            chunks.append(pline + "\n")
            pline = " "
        pline += "{0x%x, 0x%x}" % (rg[0], rg[1])
    chunks.append(pline + "};\n")
    return "".join(chunks)


for f in fkeys:
    rangeTable = Functs[f][1]
    if len(rangeTable) == 0:    # only generate if unicode ranges present
        continue
    rangeTable.sort()           # ascending tuple sequence

    # Split by the size of the upper bound before emitting anything, so that
    # each array holds exactly the entries its count announces even when a
    # short range sorts after a long one.
    # NOTE(review): a range that starts below 0x10000 but ends at or above it
    # lands wholly in the long array, which the generated xmlCharInRange only
    # searches for values >= 0x10000 - confirm such ranges never occur.
    shortRanges = [rg for rg in rangeTable if rg[1] < 0x10000]
    longRanges = [rg for rg in rangeTable if rg[1] >= 0x10000]
    numShort = len(shortRanges)
    numLong = len(longRanges)

    if numShort > 0:
        output.write(_rangeArray("xmlChSRange", "%s_srng" % f, shortRanges))
    if numLong > 0:
        output.write(_rangeArray("xmlChLRange", "%s_lrng" % f, longRanges))

    # the group ties both arrays together; a missing array is a null pointer
    pline = "xmlChRangeGroup %sGroup =\n\t{%d, %d, " % (f, numShort, numLong)
    if numShort > 0:
        pline += "%s_srng" % f
    else:
        pline += "(xmlChSRangePtr)0"
    if numLong > 0:
        pline += ", %s_lrng" % f
    else:
        pline += ", (xmlChLRangePtr)0"

    output.write(pline + "};\n\n")

# Fixed C source of the range lookup routine, appended to chvalid.c after
# the generated tables.
#
# NOTE(review): the copy of this script under review had runs of whitespace
# collapsed, so the indentation of the C code below should be confirmed
# against the repository.
rangeLookupSource = """
/**
 * xmlCharInRange:
 * @val: character to be validated
 * @rptr: pointer to range to be used to validate
 *
 * Does a binary search of the range table to determine if char
 * is valid
 *
 * Returns: true if character valid, false otherwise
 */
int
xmlCharInRange (unsigned int val, const xmlChRangeGroupPtr rptr) {
 int low, high, mid;
 xmlChSRangePtr sptr;
 xmlChLRangePtr lptr;
 if (val < 0x10000) { /* is val in 'short' or 'long' array? */
 if (rptr->nbShortRange == 0)
 return 0;
 low = 0;
 high = rptr->nbShortRange - 1;
 sptr = rptr->shortRange;
 while (low <= high) {
 mid = (low + high) / 2;
 if ((unsigned short) val < sptr[mid].low) {
 high = mid - 1;
 } else {
 if ((unsigned short) val > sptr[mid].high) {
 low = mid + 1;
 } else {
 return 1;
 }
 }
 }
 } else {
 if (rptr->nbLongRange == 0) {
 return 0;
 }
 low = 0;
 high = rptr->nbLongRange - 1;
 lptr = rptr->longRange;
 while (low <= high) {
 mid = (low + high) / 2;
 if (val < lptr[mid].low) {
 high = mid - 1;
 } else {
 if (val > lptr[mid].high) {
 low = mid + 1;
 } else {
 return 1;
 }
 }
 }
 }
 return 0;
}

"""

output.write(rangeLookupSource)

#
# finally, generate the ABI compatibility functions
#
# For every defined name a real function is emitted in chvalid.c which simply
# expands the 'Q' macro, and its prototype is added to the header.
# NOTE(review): whitespace runs were collapsed in the reviewed copy; confirm
# the indent ahead of 'return' in the function body below.
for f in fkeys:
    # names with single-byte values also have a '_ch' macro to recommend
    if max(Functs[f][0]) > 0:
        advice = " * Use %s_ch or %sQ instead" % (f, f)
    else:
        advice = " * Use %sQ instead" % f

    docOpening = """
/**
 * %s:
 * @ch: character to validate
 *
 * This function is DEPRECATED.
""" % f
    docClosing = """
 *
 * Returns true if argument valid, false otherwise
 */
"""
    output.write(docOpening + advice + docClosing)
    output.write("int\n%s(unsigned int ch) {\n return(%sQ(ch));\n}\n\n" % (f, f))
    header.write("XMLPUBFUN int XMLCALL\n\t\t%s(unsigned int ch);\n" % f)
#
# Run complete - write trailers and close the output files
#

# closes the extern "C" block and the include guard opened in the heading
headerTrailer = """
#ifdef __cplusplus
}
#endif
#endif /* __XML_CHVALID_H__ */
"""
header.write(headerTrailer)

for stream in (header, output):
    stream.close()
William M. Brack871611b2003-10-18 04:53:14 +0000571